library(tidyverse)
library(GGally)
library(genefilter)
library(reshape2)
library(data.table)
library(ggpmisc)
library(RColorBrewer)
library(plotly)
library(DT)
library(GenomicRanges)

# Directory structure
github_dir <- file.path("G:/Shared drives/Nord Lab - Computational Projects/MPRA/STAR408")


setwd(github_dir)


# Global R markdown code chunk options
knitr::opts_chunk$set(message=FALSE, 
                      warning = FALSE, 
                      error=FALSE, 
                      echo=TRUE, 
                      cache=FALSE, 
                      fig.width = 7, fig.height = 6, 
                      fig.align = 'left')

1. Read data and metadata

# This script also confirms duplicate removal.

# bed file
bed <- read.table("bed_files/Combined_408_ASD30-1000_GRCh38_FORALLELES_withoutchr.bed")
colnames(bed) <- c("Chr", "Start", "End", "Amp_name")
#head(bed)

amp_counts_with_duplicates <- data.frame(
  bed,
  "High35-L4-cDNA" = read.table("bwa_mem_amp_counts/STARR-High35-L4-cDNA_S81_L006.sv_counts.txt"),
  "L1-gDNA_S70" = read.table("bwa_mem_amp_counts/STARR-L1-gDNA_S70_L006.sv_counts.txt"),
  "L2-gDNA_S71" = read.table("bwa_mem_amp_counts/STARR-L2-gDNA_S71_L006.sv_counts.txt"),
  "L3-gDNA_S72" = read.table("bwa_mem_amp_counts/STARR-L3-gDNA_S72_L006.sv_counts.txt"),
  "L4-gDNA_S73" = read.table("bwa_mem_amp_counts/STARR-L4-gDNA_S73_L006.sv_counts.txt"),
  "Maxiprep_S80" = read.table("bwa_mem_amp_counts/STARR-Maxiprep_S80_L006.sv_counts.txt"),
  "NewL4-cDNA_S78" = read.table("bwa_mem_amp_counts/STARR-NewL4-cDNA_S78_L006.sv_counts.txt"),
  "OldL1-cDNA_S75" = read.table("bwa_mem_amp_counts/STARR-OldL1-cDNA_S75_L006.sv_counts.txt"),
  "OldL2-cDNA_S76" = read.table("bwa_mem_amp_counts/STARR-OldL2-cDNA_S76_L006.sv_counts.txt"),
  "OldL3-cDNA_S77" = read.table("bwa_mem_amp_counts/STARR-OldL3-cDNA_S77_L006.sv_counts.txt"),
  "OldR1-cDNA_S79" = read.table("bwa_mem_amp_counts/STARR-OldR1-cDNA_S79_L006.sv_counts.txt"),
  "R1-gDNA_S74" = read.table("bwa_mem_amp_counts/STARR-R1-gDNA_S74_L006.sv_counts.txt")
)

colnames(amp_counts_with_duplicates) <- c("Chr", "Start", "End", "Amp_name", 
                                          "High35-L4-cDNA",
                                          "L1-gDNA_S70",
                                          "L2-gDNA_S71",
                                          "L3-gDNA_S72",
                                          "L4-gDNA_S73",
                                          "Maxiprep_S80",
                                          "NewL4-cDNA_S78",
                                          "OldL1-cDNA_S75",
                                          "OldL2-cDNA_S76",
                                          "OldL3-cDNA_S77",
                                          "OldR1-cDNA_S79",
                                          "R1-gDNA_S74"
)



### Reads count files without duplicates

amp_counts <- data.frame(
  bed,
  "High35-L4-cDNA" = read.table("bwa_mem_amp_counts_dup_rem/STARR-High35-L4-cDNA_S81_L006.sv_counts.txt"),
  "L1-gDNA_S70" = read.table("bwa_mem_amp_counts_dup_rem/STARR-L1-gDNA_S70_L006.sv_counts.txt"),
  "L2-gDNA_S71" = read.table("bwa_mem_amp_counts_dup_rem/STARR-L2-gDNA_S71_L006.sv_counts.txt"),
  "L3-gDNA_S72" = read.table("bwa_mem_amp_counts_dup_rem/STARR-L3-gDNA_S72_L006.sv_counts.txt"),
  "L4-gDNA_S73" = read.table("bwa_mem_amp_counts_dup_rem/STARR-L4-gDNA_S73_L006.sv_counts.txt"),
  "Maxiprep_S80" = read.table("bwa_mem_amp_counts_dup_rem/STARR-Maxiprep_S80_L006.sv_counts.txt"),
  "NewL4-cDNA_S78" = read.table("bwa_mem_amp_counts_dup_rem/STARR-NewL4-cDNA_S78_L006.sv_counts.txt"),
  "OldL1-cDNA_S75" = read.table("bwa_mem_amp_counts_dup_rem/STARR-OldL1-cDNA_S75_L006.sv_counts.txt"),
  "OldL2-cDNA_S76" = read.table("bwa_mem_amp_counts_dup_rem/STARR-OldL2-cDNA_S76_L006.sv_counts.txt"),
  "OldL3-cDNA_S77" = read.table("bwa_mem_amp_counts_dup_rem/STARR-OldL3-cDNA_S77_L006.sv_counts.txt"),
  "OldR1-cDNA_S79" = read.table("bwa_mem_amp_counts_dup_rem/STARR-OldR1-cDNA_S79_L006.sv_counts.txt"),
  "R1-gDNA_S74" = read.table("bwa_mem_amp_counts_dup_rem/STARR-R1-gDNA_S74_L006.sv_counts.txt")
)

colnames(amp_counts) <- c("Chr", "Start", "End", "Amp_name", 
                          "High35-L4-cDNA",
                          "L1-gDNA_S70",
                          "L2-gDNA_S71",
                          "L3-gDNA_S72",
                          "L4-gDNA_S73",
                          "Maxiprep_S80",
                          "NewL4-cDNA_S78",
                          "OldL1-cDNA_S75",
                          "OldL2-cDNA_S76",
                          "OldL3-cDNA_S77",
                          "OldR1-cDNA_S79",
                          "R1-gDNA_S74"
)

# Sanity check confirming duplicate removal = PASSED
#head(amp_counts_with_duplicates, 2)
#head(amp_counts, 2)

# Alex: Amplicons with names that have "UN" are the non-overlapping portions of overlapping amplicons. I think we should remove these from the pool for the modeling.
# amp_counts$Amp_name[grepl("*UN*", amp_counts$Amp_name)]

amp_counts <- dplyr::filter(amp_counts, Amp_name %in% amp_counts$Amp_name[!grepl("*UN*", amp_counts$Amp_name)])

1.1. Metadata

# Sample metadata
metadata <- read.csv("STAR408_metadata.csv")

# Adds data column to metadaa
#rep(colnames(amp_counts)[5:16], each = 2)
metadata$amp_counts_col <- rep(colnames(amp_counts)[5:16], each = 2)

datatable(metadata, 
          rownames = FALSE, 
          filter = list(position = 'top', clear = FALSE, plain = FALSE), 
          options = list(paging = FALSE, scrollX=T, scrollY="500px", searching = FALSE))
## Amplicon classification. 
# All amplicons ever generated in the lab were counted by samtools view. 
# Color and Color2 classifies them to projects and also subclasses of the STAR408.
# Color denounces library type
amp_counts$Color <- "ASD1K"
amp_counts$Color <- ifelse(grepl("CACNA1C", amp_counts$Amp_name), "STAR408", amp_counts$Color)
amp_counts$Color <- ifelse(grepl("_Amp", amp_counts$Amp_name), "ASD100", amp_counts$Color)
amp_counts$Color <- ifelse(grepl("-", amp_counts$Amp_name), "ASD100", amp_counts$Color)
amp_counts$Color <- ifelse(grepl("SCN1A", amp_counts$Amp_name), "STAR408", amp_counts$Color)
amp_counts$Color <- ifelse(grepl("Control", amp_counts$Amp_name), "STAR408", amp_counts$Color)
amp_counts$Color <- ifelse(grepl("SZ108", amp_counts$Amp_name), "STAR408", amp_counts$Color)
amp_counts$Color <- ifelse(grepl("NCASD", amp_counts$Amp_name), "STAR408", amp_counts$Color)
amp_counts$Color <- ifelse(grepl("Epilepsy", amp_counts$Amp_name), "STAR408", amp_counts$Color)

# Color2 indicates a subclass of STAR408
amp_counts$Color2 <- ifelse(grepl("CACNA1C", amp_counts$Amp_name), "CACNA1C", "Non-STAR408")
amp_counts$Color2 <- ifelse(grepl("SCN1A", amp_counts$Amp_name), "SCN1A", amp_counts$Color2)
amp_counts$Color2 <- ifelse(grepl("Control", amp_counts$Amp_name), "Control", amp_counts$Color2)
amp_counts$Color2 <- ifelse(grepl("SZ108", amp_counts$Amp_name), "SZ108", amp_counts$Color2)
amp_counts$Color2 <- ifelse(grepl("NCASD", amp_counts$Amp_name), "NCASD", amp_counts$Color2)
amp_counts$Color2 <- ifelse(grepl("Epilepsy", amp_counts$Amp_name), "Epilepsy", amp_counts$Color2)


#knitr::kable(table(amp_counts$Color), caption = "Number of amplicons per library type generated in the lab")
# Saving a count file for GEO and Supp Table

df_count_file <- dplyr::filter(amp_counts, Color == "STAR408")[,1:16]
colnames(df_count_file) <- c(colnames(df_count_file)[1],
                             paste0(colnames(df_count_file)[2:3], "_GRCh38"),
                             colnames(df_count_file)[4:16])

# Removing R (contralateral) samples from the published dataset. They are not very informative.
df_count_file$`OldR1-cDNA_S79` <- NULL
df_count_file$`R1-gDNA_S74` <- NULL

#write.csv(df_count_file, file = "G:/Shared drives/Nord Lab - Computational #Projects/MPRA/STAR408/STAR408_counts.csv")

#write.csv(df_count_file, file = "G:/Shared drives/Nord Lab - Temp Overflow/STARR-408_Lambert&Su-Feher/biorxiv #submission/20210412/Supp_Tables/STAR408_counts.csv")

library(tools)

md5sum("STAR408_counts.csv")
md5sum("../mini_MPRA/miniMPRA_counts.csv")
# Saving bed coordinates

## PutEnh (Control)
## FBDHS (NCASD)
## GWAS (SZ108, Epilepsy)
## LD (SCN1A, CACNA1C)

groups <- ifelse(grepl("SCN1A|CACNA1C", df_count_file$Amp_name), "LD", df_count_file$Amp_name)
groups <- ifelse(grepl("SZ108|Epilepsy", groups), "GWAS", groups)
groups <- ifelse(grepl("NCASD", groups), "FBDHS", groups)
groups <- ifelse(grepl("Control", groups), "PutEnh", groups)

bed_coord <- df_count_file[,1:4]

bed_coord$Group <- groups

#write.csv(bed_coord, file = "G:/Shared drives/Nord Lab - Temp Overflow/STARR-408_Lambert&Su-Feher/biorxiv #submission/20210412/Supp_Tables/STAR408_bed_GRCh38_coordinates.csv", row.names = F)
# Saving metadata file for Supp table. 

# Removing R (contralateral) samples from the published dataset. They are not very informative.

metadata_cleaned <- metadata[metadata$amp_counts_col != "OldR1-cDNA_S79" & metadata$amp_counts_col != "R1-gDNA_S74",]

# Removing unnecessary columns
metadata_cleaned$RT.batch <- NULL
metadata_cleaned$Bioanalyzer.Notes <- NULL

metadata_cleaned$Biological.rep <- as.factor(metadata_cleaned$Biological.rep)

#write.csv(metadata_cleaned, file = "G:/Shared drives/Nord Lab - Temp #Overflow/STARR-408_Lambert&Su-Feher/biorxiv submission/20210412/Supp_Tables/STAR408_metadata.csv")

2. Amplicon proportions

# Amplicon proportions are calculated relative to the library depth.

# Calculating proportions
# Proportion function here: (x+1)/(sum(x)+1)  # +1 is padding

df <- dplyr::filter(amp_counts, Color == "STAR408")[,4:16]
rownames(df) <- df$Amp_name
df$Amp_name <- NULL

amp.prop <- as.data.frame(apply(df, 2, function(x) { (x+1)/(sum(x, na.rm = T)+1) }))

# Adds additional low constant to limit variability due to low DNA counts. Has no real effect on the data since the added value is equal to only 3 counts.

# Finding the 1st percentile value of non-zero DNA counts:
DNA_prop_values <- as.numeric(as.matrix((amp.prop[,c("L1-gDNA_S70", "L2-gDNA_S71", "L3-gDNA_S72", "L4-gDNA_S73")])))

# Find prop_padding value equivalent to th 1st percentile of counts excluding 0
amp_counts_STAR408 <- filter(amp_counts, Color == "STAR408")

DNA_count_values <- as.numeric(as.matrix((amp_counts_STAR408[,c("L1-gDNA_S70", "L2-gDNA_S71", "L3-gDNA_S72", "L4-gDNA_S73")])))

prop_padding <- as.numeric(quantile(as.numeric(na.omit(DNA_prop_values[DNA_count_values != 0])), 0.01))

# What is the count equivalent of prop_padding?

# finding the nearest DNA proportion value. 
#DNA_prop_values[which.min(abs(DNA_prop_values - prop_padding))]  # 2.66941e-07

df <- data.frame("DNA_count_values" = as.numeric(DNA_count_values), 
                 "DNA_prop_values" = as.numeric(DNA_prop_values))

# df[which.min(abs(DNA_prop_values - prop_padding)),]     # 3 counts

#par(mfrow = c(1,1))

#hist(DNA_prop_values, breaks = 100)
#abline(v = prop_padding, col = "red")

# Adds the small constant
amp.prop <- amp.prop + prop_padding
#amp.prop <- amp.prop

amp.prop$Amplicon <- rownames(amp.prop)
amp.prop.plot <- amp.prop[,1:12]

# Making colnames more compact
colnames(amp.prop.plot) <- c("L4-RNA-35", "L1-DNA", "L2-DNA", 
                             "L3-DNA", "L4-DNA", "Maxiprep",
                             "L4-RNA", "L1-RNA", "L2-RNA",
                             "L3-RNA", "R1-RNA", "R1-DNA")

# Rearranges columns
amp.prop.plot <- amp.prop.plot[,c("L1-DNA", "L2-DNA", "L3-DNA", "L4-DNA",
                                  "L1-RNA", "L2-RNA", "L3-RNA", "L4-RNA",
                                  "L4-RNA-35", "Maxiprep", "R1-DNA", "R1-RNA")]

2.1 Proportion correlations

2.1.1 All samples

ggpairs(log2(amp.prop.plot), progress = FALSE,
             lower = list(continuous = wrap("points", size = 0.5, alpha = 0.2, pch = 16), 
                          combo = "box", discrete = "blank", na = "na"),
             diag = list(discrete = "blankDiag", continuous = wrap("densityDiag", alpha = 0.4), na = "blankDiag"), 
             upper = list(continuous = wrap('cor', size = 3))) + 
  theme_bw()+ 
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), text = element_text(size = 8), axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
Amplicon log2 proportion correlations between all samples in the analysis. L1-, L2-, L3-, L4-DNA/RNA are the primary experimental samples. L4-RNA-35 is the high PCR cycle control. R1-DNA/RNA are samples collected from the right hemisphere, contralateral to the MPRA library injection.

Amplicon log2 proportion correlations between all samples in the analysis. L1-, L2-, L3-, L4-DNA/RNA are the primary experimental samples. L4-RNA-35 is the high PCR cycle control. R1-DNA/RNA are samples collected from the right hemisphere, contralateral to the MPRA library injection.

2.1.2 DNA and Maxiprep

Unfiltered 408 amplicons

ggpairs(log2(amp.prop.plot[,c(1:4,10)]), progress = FALSE,
             lower = list(continuous = wrap("points", size = 0.5, alpha = 0.2, pch = 16), 
                          combo = "box", discrete = "blank", na = "na"),
             diag = list(discrete = "blankDiag", continuous = wrap("densityDiag", alpha = 0.4), na = "blankDiag"), 
             upper = list(continuous = wrap('cor', size = 4))) + 
  labs(title = bquote(gDNA~and~Maxiprep~by~log[2](Proportion))) + 
  theme_bw()+ 
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), text = element_text(size = 10), 
        axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
Plasmid Maxiprep library is well correlated with DNA samples, indicating good viral recovery rate after MPRA library injection.

Plasmid Maxiprep library is well correlated with DNA samples, indicating good viral recovery rate after MPRA library injection.

2.1.3 RNA

Unfiltered 408 amplicons

ggpairs(log2(amp.prop.plot[,c(5:9)]), progress = FALSE,
             lower = list(continuous = wrap("points", size = 0.5, alpha = 0.2, pch = 16), combo = "box", discrete = "blank", na = "na"),
             diag = list(discrete = "blankDiag", continuous = wrap("densityDiag", alpha = 0.4), na = "blankDiag"), 
             upper = list(continuous = wrap('cor', size = 4))) + 
  labs(title = bquote(cDNA~by~log[2](Proportion))) + 
  theme_bw()+ 
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), text = element_text(size = 10), axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
RNA samples are well correlated with each other.

RNA samples are well correlated with each other.

3. Quality control

3.1 Test for the library contamination

# I'm testing the composition in the 4 main gDNA samples only, ignoring minor differences in library depths.

df_gDNA <- amp_counts[,c(4,6:9,17, 18)]
df_gDNA_m <- melt(df_gDNA)

DT <- data.table(df_gDNA_m)

# Bar plot per library type
lib_composition <- DT[,list(Sum=sum(value)), by=c("Color")]
lib_composition$Fraction <- lib_composition$Sum / sum(lib_composition$Sum)


  ggplot(lib_composition, aes(x = Color, y = Fraction))+
  geom_bar(stat="identity") +
  theme_bw() + theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)) + 
  labs(x="Library", y="Fraction of counts")
Testing for the presence of amplicons used in our laboratory in the past, is a standard element of our analysis pipeline. The vast majority of counts in this experiment originate from the STAR408 library.

Testing for the presence of amplicons used in our laboratory in the past, is a standard element of our analysis pipeline. The vast majority of counts in this experiment originate from the STAR408 library.

# knitr::kable(lib_composition) 

4. Ratiometric activity

# Calculation of the ratiometric activity

L1_act <- amp.prop.plot$`L1-RNA` / amp.prop.plot$`L1-DNA`
L2_act <- amp.prop.plot$`L2-RNA` / amp.prop.plot$`L2-DNA`
L3_act <- amp.prop.plot$`L3-RNA` / amp.prop.plot$`L3-DNA`
L4_act <- amp.prop.plot$`L4-RNA` / amp.prop.plot$`L4-DNA`
L4H_act <- amp.prop.plot$`L4-RNA-35` / amp.prop.plot$`L4-DNA`
R1_act <- amp.prop.plot$`R1-RNA` / amp.prop.plot$`R1-DNA`

amp.act <- data.frame("amp_id" = rownames(amp.prop.plot),
                      "L1_Activity" = L1_act,
                      "L2_Activity" = L2_act,
                      "L3_Activity" = L3_act,
                      "L4_Activity" = L4_act,
                      "L4-35_Activity" = L4H_act,
                      "R1_Activity" = R1_act)

amp.act <- merge(amp.act, amp_counts[, c(4,17:18)], by.x = "amp_id", by.y = "Amp_name")

4.1 Activity correlation plots

4.1.1 Individual samples

amp.act_plot <- amp.act
rownames(amp.act_plot) <- amp.act_plot$amp_id
amp.act_plot <- amp.act_plot[,2:7]

ggpairs(log2(amp.act_plot), progress = FALSE,
             lower = list(continuous = wrap("points", size = 1, alpha = 0.2, pch = 16), 
                          combo = "box", discrete = "blank", na = "na"),
             diag = list(discrete = "blankDiag", continuous = wrap("densityDiag", alpha = 0.4), na = "blankDiag"), 
             upper = list(continuous = wrap('cor', size = 4)))+ 
        theme_bw()+ 
        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), 
              text = element_text(size = 10), 
              axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

4.1.2 Mean activity correlations

library(genefilter)
library(ggpmisc)

# Calculating average activity and annotating amplicon types. I'm only using the 4 major replicates for calculating the means, skipping the 35 cycle PCR control (replicate), and the contralateral ("R", right hemisphere) sample.


amp.prop.plot$DNA_prop_mean <- rowMeans(as.matrix(amp.prop.plot[,1:4]))
amp.prop.plot$DNA_prop_SD <- rowSds(as.matrix(amp.prop.plot[,1:4]))

amp.prop.plot$RNA_prop_mean <- rowMeans(as.matrix(amp.prop.plot[,5:8]))
amp.prop.plot$RNA_prop_SD <- rowSds(as.matrix(amp.prop.plot[,5:8]))

# This is to indicate amplicons with RNA_prop_mean > DNA_prop_mean
amp.prop.plot$Sign <- ifelse(amp.prop.plot$RNA_prop_mean / amp.prop.plot$DNA_prop_mean > 1, "Positive", "Negative")

formula <- y ~ x

ggplot(amp.prop.plot, aes(x = log2(DNA_prop_mean), y = log2(RNA_prop_mean)))+
  geom_point(alpha = 0.3, aes(color = Sign))+
  scale_color_manual(values = c("blue", "red")) + 
  labs(title = "Mean log2 DNA vs RNA prop.", x = bquote(log[2](proportion[DNA])), y = bquote(log[2](proportion[RNA])))+
  geom_abline(intercept = 0, color = "red", linetype = 2)+
  geom_smooth(formula = formula, method = 'lm', color = "black")+
  stat_poly_eq(formula = formula, parse = TRUE, size = 4)+
  geom_abline(intercept = 0, color = "red", linetype = 2)+
  coord_fixed()+ 
  theme_bw()+ 
  theme(text = element_text(size = 12), plot.title = element_text(size = 14))+ 
  theme(plot.title = element_text(hjust = 0.5))



# STAR408 amplicons split into subcategories

# Adding Colors indicating library type from amp_counts
amp.prop.plot$Amp_name <- rownames(amp.prop.plot)
amp.prop.plot <- merge(amp.prop.plot, amp_counts[,c(4,17,18)], by = "Amp_name")

j_brew_colors <- brewer.pal(n = 8, name = "Set1")[c(1:6)]
j_brew_colors <- c("#E41A1C", "#377EB8", "#4DAF4A", "#984EA3","#FF7F00", "#abab20")

ggplot(amp.prop.plot, aes(x = log2(DNA_prop_mean), y = log2(RNA_prop_mean)))+
  geom_point(alpha = 0.4, aes(color = Color2))+
  labs(title = "DNA vs RNA proportion", 
       x = bquote(log[2](proportion[DNA])), 
       y = bquote(log[2](proportion[RNA])), 
       color = NULL)+
  geom_abline(intercept = 0, color = "red", linetype = 2)+
  geom_smooth(formula = formula, method = 'lm', color = "black")+
  stat_poly_eq(formula = formula, parse = TRUE)+
  geom_abline(intercept = 0, color = "red", linetype = 2)+
  coord_fixed() + 
  theme_bw()+
  theme(plot.title = element_text(hjust = 0.5), text = element_text(size = 10), legend.position = "none")+
  scale_color_manual(values = j_brew_colors)+
  facet_wrap(~ Color2)
DNA vs RNA proportions demonstrate good linear correlation. Figure split by amplicon category.DNA vs RNA proportions demonstrate good linear correlation. Figure split by amplicon category.

DNA vs RNA proportions demonstrate good linear correlation. Figure split by amplicon category.

4.1.2 Mean activity barplot

# Calculate average activity and annotating amplicon types. I'm only using the 4 
# major replicates for calculating the means, skipping the 35 cycle PCR control, and the contralateral ("R") sample. 

amp.act$Mean_act <- rowMeans(as.matrix(amp.act[,c(2:5)]))
amp.act$Mean_act_log2 <- rowMeans(as.matrix(log2(amp.act[,c(2:5)])))
amp.act$SD <- rowSds(as.matrix(amp.act[,c(2:5)]))
amp.act$SD_log2 <- rowSds(as.matrix(log2(amp.act[,c(2:5)])))

amp.act2 <- arrange(amp.act, Mean_act)
amp.act2$amp_id <- factor(amp.act2$amp_id, levels = amp.act2$amp_id)

# log2 Mean activity of all 408 amplicons, color coded by amplicon type
ggplot(amp.act2, aes(x = amp_id, y = log2(Mean_act), fill = Color2))+
  geom_bar(stat="identity")+
  labs(fill = NULL) + 
  theme_bw()+
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.title.x=element_blank(),
    axis.text.x=element_blank(),
    axis.ticks.x=element_blank())+
  scale_fill_manual(values = j_brew_colors)+
  geom_errorbar(aes(ymin = log2(Mean_act) - log2(SD), ymax = log2(Mean_act) + log2(SD)), width = 0.1, alpha = 0.2)

# This builds a summary data frame containing counts, proportions, activity, and average values. 

##### Counts #####
amp_counts_df <- dplyr::filter(amp_counts, Color == "STAR408")
amp_counts_df <- amp_counts_df[,c(4, 1:3, 17, 18, 6:9, 12:14, 11)]
colnames(amp_counts_df) <- c(colnames(amp_counts_df)[1:6], "L1_DNA_count", "L2_DNA_count", "L3_DNA_count", "L4_DNA_count",
                             "L1_RNA_count", "L2_RNA_count", "L3_RNA_count", "L4_RNA_count")

amp_counts_df$DNA_count_mean <- rowMeans(as.matrix(amp_counts_df[,7:10]))
amp_counts_df$DNA_count_SD <- rowSds(as.matrix(amp_counts_df[,7:10]))

amp_counts_df$RNA_count_mean <- rowMeans(as.matrix(amp_counts_df[,11:14]))
amp_counts_df$RNA_count_SD <- rowSds(as.matrix(amp_counts_df[,11:14]))

# Testing for > 200 counts in 4 DNA samples. DNA threshold. 
min_count = 200
amp_counts_df$Pass_DNA_count <- rowSums(amp_counts_df[,c(7:10)]>min_count) >=4

##### Proportions #####
amp.prop.plot_df <- amp.prop.plot[,c(1:9, 14:18)]
# Sanity check:
#data.frame("one" = colnames(amp.prop.plot_df), 
#           "two" = c("Amp_name", "L1_DNA_prop", "L2_DNA_prop", "L3_DNA_prop", "L4_DNA_prop",
#                     "L1_RNA_prop", "L2_RNA_prop", "L3_RNA_prop", "L4_RNA_prop",
#                     "DNA_prop_mean", "DNA_prop_SD", "RNA_prop_mean", "RNA_prop_SD", "Sign"))
                     
                     
colnames(amp.prop.plot_df) <- c("Amp_name", "L1_DNA_prop", "L2_DNA_prop", "L3_DNA_prop", "L4_DNA_prop",
                               "L1_RNA_prop", "L2_RNA_prop", "L3_RNA_prop", "L4_RNA_prop",
                               "DNA_prop_mean", "DNA_prop_SD", "RNA_prop_mean", "RNA_prop_SD", "Sign")
                               
                      
##### Activity #####
amp.act_df <- amp.act_plot
amp.act_df$Amp_name <- rownames(amp.act_df)
rownames(amp.act_df) <- NULL
amp.act_df <- amp.act_df[,c(7, 1:4)]

amp.act_df$Mean_act <- rowMeans(as.matrix(amp.act_df[,c(2:5)]))  
amp.act_df$Mean_act_SD <- rowSds(as.matrix(amp.act_df[,c(2:5)]))


x <- merge(amp_counts_df, amp.prop.plot_df, by = "Amp_name")
y <- merge(x, amp.act_df, by = "Amp_name")

#Adding Maxi prep stats
y <- merge(x, amp.act_df, by = "Amp_name")

Maxi_counts <- dplyr::filter(amp_counts, Color == "STAR408")[,c(4, 10)]
colnames(Maxi_counts) <- c("Amp_name", "Maxi_counts")

Maxi_prop <- amp.prop[,c(13, 6)]
rownames(Maxi_prop) <- NULL
colnames(Maxi_prop) <- c("Amp_name", "Maxi_prop")

Maxi <- merge(Maxi_counts, Maxi_prop)

y <- merge(y, Maxi, by = "Amp_name")

# Comparing to the previous version of this script this CSV has "L4.35_Activity" "R1_Activity" now.
#write.csv(file = "./CSV_output_files/STAR408_counts_proportions_activity.csv", y)

5. GC content

Explored analysis with 322 amplicons passing > 200 counts in 4 DNA samples

# Yurong calculated these values. I later checked they match values from in-silico PCR.
GC_content <- read.table("./STAR408_Amplicons_GC.txt", header = TRUE)

# Adding GC column to the cleaned and full (y) dfs
y <- merge(y, GC_content, by.x = "Amp_name", by.y = "Amplicon")

df_clean <- dplyr::filter(y, Pass_DNA_count == TRUE) # 322 rows

# Lets see how GC content correlates with activity and abundance.
formula <- y ~ x

ggplot(df_clean, aes(x = log2(DNA_prop_mean), y = GC))+
  geom_point(size = 0.5, alpha = 0.5)+
  geom_smooth(formula = formula, method = 'lm', color = "black")+
  #stat_poly_eq(formula = formula, parse = TRUE, eq.with.lhs = TRUE)+
  stat_poly_eq(formula = formula, 
               aes(label = paste(..rr.label.., ..eq.label.., sep = "~~~")), 
               parse = TRUE, size = 8)+ 
  labs(title = "DNA proportion vs GC content", 
       x = bquote(log[2](mean(prop[DNA]))), 
       y = "Amplicon GC Content")+ 
  theme_bw()+ 
  theme(text = element_text(size = 12), plot.title = element_text(size = 12))+
  theme(plot.title = element_text(hjust = 0.5))+
  theme(plot.title = element_text(hjust = 0.5))
GC content is positively corelated with DNA abundance suggesting that GC content is a predictor or a confounder of the activity.

GC content is positively corelated with DNA abundance suggesting that GC content is a predictor or a confounder of the activity.

## Conclusions 
# GC content is positively corelated with gDNA abundance suggesting that GC content is a good predictor or a confounder of the activity modeling. 
ggplot(df_clean, aes(x = log2(RNA_prop_mean), y = GC))+
  geom_point(size = 0.5, alpha = 0.5) +
  geom_smooth(formula = formula, method = 'lm', color = "black")+
  #stat_poly_eq(formula = formula, parse = TRUE, eq.with.lhs = TRUE)+
  stat_poly_eq(formula = formula, 
               aes(label = paste(..rr.label.., ..eq.label.., sep = "~~~")), 
               parse = TRUE, size = 8) + 
  labs(title = "RNA proportion vs GC content", x = bquote(log[2](mean(prop[RNA]))), y = "Amplicon GC Content") + 
  theme_bw()+ 
  theme(text = element_text(size = 12), plot.title = element_text(size = 12))+
  theme(plot.title = element_text(hjust = 0.5))
RNA abundance does not demonstrate this positive correlation

RNA abundance does not demonstrate this positive correlation

ggplot(df_clean, aes(x = log2(Mean_act), y = GC))+
  geom_point(size = 0.5, alpha = 0.5) +
  geom_smooth(formula = formula, method = 'lm', color = "black")+
  #stat_poly_eq(formula = formula, parse = TRUE, eq.with.lhs = TRUE)+
  stat_poly_eq(formula = formula, 
               aes(label = paste(..rr.label.., ..eq.label.., sep = "~~~")), 
               parse = TRUE, size = 8) + 
  labs(title = "Activity vs GC content", x = bquote(log[2](mean(Activity))), y = "Amplicon GC Content") + 
  theme_bw()+
  theme(text = element_text(size = 12), plot.title = element_text(size = 12))+
  theme(plot.title = element_text(hjust = 0.5))
Activity is negatively correlated with GC content

Activity is negatively correlated with GC content

# Color code the GC content in the gDNA vs cDNA prop plot.
# Split at GC = 0.5
y$GC_above_mean <- ifelse(y$GC > mean(y$GC), "TRUE", "FALSE")
df_clean <- dplyr::filter(y, Pass_DNA_count == TRUE)

# Split at mean GC = 0.4063235 <--- 0.4165686? 
ggplot(df_clean, aes(x = log2(DNA_prop_mean), y = log2(RNA_prop_mean), color = GC))+
  geom_point(size = 1.5, alpha = 0.9)+
  geom_smooth(formula = formula, method = 'lm', color = "black")+ 
  stat_poly_eq(formula = formula, label.y = "bottom", label.x = "right",
               aes(label = paste(..rr.label.., ..eq.label.., sep = "~~~")), 
               parse = TRUE, size = 4)+
  scale_color_gradient2(midpoint = mean(df_clean$GC), low="blue3", mid="white",
                        high="red3", space ="Lab" )+
  labs(title = "RNA and DNA vs GC content", 
       x = bquote(log[2](mean(prop[DNA]))), 
       y = bquote(log[2](mean(prop[RNA]))))+ 
  theme_bw()+
  theme(text = element_text(size = 12), plot.title = element_text(size = 12))+
  facet_wrap(~ GC_above_mean)+
  theme(plot.title = element_text(hjust = 0.5))
DNA vs RNA proportions split at the mean value of the GC content

DNA vs RNA proportions split at the mean value of the GC content

ggplot(df_clean, aes(x = log2(Maxi_prop), y = GC))+
  geom_point(size = 0.5, alpha = 0.5)+
  geom_smooth(formula = formula, method = 'lm', color = "black")+ 
  stat_poly_eq(formula = formula, label.y = "top", label.x = "left",
               aes(label = paste(..rr.label.., ..eq.label.., sep = "~~~")), 
               parse = TRUE, size = 8)+
  scale_color_gradient2(midpoint = mean(df_clean$GC), low="blue3", mid="white",
                        high="red3", space ="Lab" )+
  labs(title = "Maxiprep vs GC content", x = bquote(log[2](prop[Maxiprep])), y = "Amplicon GC Content") + 
  theme_bw()+
  theme(text = element_text(size = 12), plot.title = element_text(size = 12), aspect.ratio = 1)+
  theme(plot.title = element_text(hjust = 0.5))
Maxiprep proportions are correlated with GC content similarly to DNA proportions. This suggests that the library prep process favors amplicons with higher GC content.

Maxiprep proportions are correlated with GC content similarly to DNA proportions. This suggests that the library prep process favors amplicons with higher GC content.

6. In silico PCR

  • Objectives
    • Reassess amplicon amplification specificity
    • Report amplicon coordinates in GRCh38
    • Independently verify GC content data
# Saving a Supp Table with primers
primers <- read.csv("In_silico_PCR_STAR408/STAR408_primers.csv", header = TRUE, fill = TRUE)

# Indicating reference genome
colnames(primers)[8] <- "Amplicon.Range_hg19"

#write.csv(primers, file = "G:/Shared drives/Nord Lab - Temp Overflow/STARR-408_Lambert&Su-Feher/biorxiv submission/20210412/Supp_Tables/STAR408_primers.csv")
# Integration with In silico PCR analysis

# In silico PCR results and scripts are located at "G:\Shared drives\Nord Lab Team Drive\Projects\MPRA_ComputationalAnalysis\STAR408_ComputationalAnalysis\STAR408_KC\In_silico_PCR_STAR408"

# Primers were designed using hg19 genome. 
primers <- read.csv("In_silico_PCR_STAR408/STAR408_primers.csv", header = TRUE, fill = TRUE)

# In silico PCR was run in batches:

a <- read.csv("In_silico_PCR_STAR408/STAR408_in_silico_PCR__1_to_10.csv", header = TRUE)
b <- read.csv("In_silico_PCR_STAR408/STAR408_in_silico_PCR__11_to_100.csv", header = TRUE)
c <- read.csv("In_silico_PCR_STAR408/STAR408_in_silico_PCR__101_to_120.csv", header = TRUE)
d <- read.csv("In_silico_PCR_STAR408/STAR408_in_silico_PCR__121_to_200.csv", header = TRUE)
e <- read.csv("In_silico_PCR_STAR408/STAR408_in_silico_PCR__201_to_250.csv", header = TRUE)
f <- read.csv("In_silico_PCR_STAR408/STAR408_in_silico_PCR__251_to_300.csv", header = TRUE)
g <- read.csv("In_silico_PCR_STAR408/STAR408_in_silico_PCR__301_to_350.csv", header = TRUE)
h <- read.csv("In_silico_PCR_STAR408/STAR408_in_silico_PCR__351_to_400.csv", header = TRUE)
i <- read.csv("In_silico_PCR_STAR408/STAR408_in_silico_PCR__401_to_408.csv", header = TRUE)

df_in_silico <- rbind(a, b, c, d, e, f, g ,h, i)
df_in_silico$X <- NULL
df_in_silico$CHROMOSOME <- NULL

colnames(df_in_silico)[13] <- "Chr_GRCh38_pred"
colnames(df_in_silico)[14] <- "Amplicon_Start_GRCh38"
colnames(df_in_silico)[15] <- "Amplicon_End_GRCh38"
colnames(df_in_silico)[16] <- "Amplicon_length_GRCh38"
colnames(df_in_silico)[18] <- "Sequence_GRCh38"

#head(df_in_silico[,1:17])
#dim(df_in_silico)  # 479 PCR products vs 408 amplicons or primer pairs.

## Linda modification: fix weird column factor issues 
## e.g. 
# > levels(df_in_silico$Multiple.Amplicons.)
# [1] "No"   "Yes"  "No  " ## in addition to chr and other columns
# KC: thanks!
colnames(df_in_silico)[8] <- "Multiple.Amplicons"
df_in_silico$Multiple.Amplicons <- trimws(df_in_silico$Multiple.Amplicons)
df_in_silico$chr <- trimws(df_in_silico$chr)

# Flag amplicons with multiple PCR products
# There are 8 unspecific amplicons in In silico PCR with max product size maxProductSize = 1500 bp.
# Most by products have much shorter lengths, making them likely to be packaged into viruses. The lack of PCR specificity decreases its efficiency.

df <- as.data.frame(table(df_in_silico$UNIQID))
unspecific_PCR <- dplyr::filter(df, Freq > 1)  

# Unspecific amplicons and their product sizes
#dplyr::filter(df_in_silico, UNIQID %in% unspecific_PCR$Var1)[,c(1,16)]

# In_silico_specificity = In silico PCR predicted multiple products.
y$Perfect_In_silico_specificity <- ifelse(y$Amp_name %in% as.character(unspecific_PCR$Var1), FALSE, TRUE)

# Sanity check
#filter(y, Perfect_In_silico_specificity == FALSE)

# GC content calculation
library(stringr)

GC_cont <- function(x){
  
  x <- toupper(x) # just in case
  num_g <- str_count(x, "G")
  num_c <- str_count(x, "C")
  gc_content <- (num_g + num_c) / str_length(x)
  gc_content
  
}

df_in_silico$GC2 <- GC_cont(df_in_silico$Sequence)

# I'm flagging amplicons as intended if:
# 1) chromosomes predicted by isPCR and chromosomes expected from primer design match, 
# 2) isPCR product length is +/- 5 bp from the product predicted during primer design on hg19 genome. 

# Fixing X chrom name and chrom class encoding to permit logical operations
# an ugly hack...
df_in_silico$chr <- ifelse(is.na(as.numeric(df_in_silico$chr)), 23, as.numeric(df_in_silico$chr))
df_in_silico$Chr_GRCh38_pred <- as.numeric(df_in_silico$Chr_GRCh38_pred)
df_in_silico$chr <- as.numeric(df_in_silico$chr)

# NA due to a lack of 1 isPCR product was messing up with the analysis - removes 1 record.
d <- na.omit(df_in_silico)

# Sanity checks
#unique(setdiff(df_in_silico$UNIQID, d$UNIQID))
#dplyr::filter(df_in_silico, UNIQID %in% unique(setdiff(df_in_silico$UNIQID, d$UNIQID)))
# 

d$Intended_interval <- sapply(1:nrow(d), function(x) d$Amplicon.Length[x] 
                              %in% c((d$Amplicon_length_GRCh38[x] - 5) : (d$Amplicon_length_GRCh38[x] + 5)))

# Offending amplicon with no PCR product
#setdiff(df_in_silico$UNIQID, d$UNIQID)   #  "274_SZ108"

# Extracting that extra amplicon which fail to amplify in in silico PCR
am274 <- dplyr::filter(df_in_silico, UNIQID == "274_SZ108")
am274$Intended_interval <- NA

d2 <- rbind(d, am274)  # Adding this failed amplicon to the full dataset

# Checks if the chrom number is as expected
d2$Intended_chr <- d2$chr == d2$Chr_GRCh38_pred

d2$Intended_amplicon <- d2$Intended_chr & d2$Intended_interval

#sum(na.omit(d2$Intended_amplicon))  # 406 amplicons meeting Intended amplicon criteria

# Manual inspection
#dim(dplyr::filter(d2,  Intended_amplicon == FALSE)) # 72 spurious amplicons
PCR_byproducts <- dplyr::filter(d2,  Intended_amplicon == FALSE) # 71 PCR byproducts 
#PCR_byproducts[,c(1, 6, 16, 20, 12, 13, 21:22)]

# Are they the same UNIQIDs as previously identified multi-product IDs?  YES.
# There is no need to update y
#unique(PCR_byproducts$UNIQID)  #9 + "274_SZ108"

# Manual check if these non specific amplicons have at least 1 specific/intended product
# dplyr::filter(d2, UNIQID %in% as.character(unique(PCR_byproducts$UNIQID)))[,c(1, 6, 16, 20, 12, 13, 21:22)]

# 16_CACNA1C - 1 specific amplicon
# 51_SCN1A   - 1 specific amplicon
# 99_SCN1A   - 1 specific amplicon
# 132_NCASD  - 1 specific amplicon
# 220_NCASD  - 1 specific amplicon
# 277_SZ108  - 1 amplicon 7 bp shorter than expected
# 289_SZ108  - 1 specific amplicon
# 290_SZ108  - 1 specific amplicon
# 355_SZ108  - 1 specific amplicon

# Marking amplicons with at least 1 specific PCR product
x <- unique(dplyr::filter(d2, Intended_amplicon == TRUE)$UNIQID)

# What a mess! There are differences in amplicon names :/
#setdiff(d2$UNIQID, y$Amp_name)
#setdiff(y$Amp_name, d2$UNIQID)

y$Amp_name2 <- gsub("Control_.*", "Controls", y$Amp_name)

# Good. Now we have the same Amp_name reference
#setdiff(y$Amp_name2, d2$UNIQID) # character(0)
#setdiff(d2$UNIQID, y$Amp_name2) # character(0)

y$At_least_1_specific_in_sil_prod <- ifelse(y$Amp_name2 %in% x, TRUE, FALSE) #406 of 408

# Amplicons without at least 1 specific PCR product
# dplyr::filter(y, At_least_1_specific_in_sil_prod == FALSE)$Amp_name  # 2 amplicons <--- Linda: where is this 2 coming from?
# KC: 274 doesn't have an in-silico PCR product, "277_SZ108" have no PCR product flagged as potentially specific.

# Sanity check = PASSED
# Are all amplicons in y also in the in_silico
#length(unique(primers$Amplicon.ID)) # 408
#length(unique(df_in_silico$UNIQID)) # 408
#length(unique(d$UNIQID)) # 407
#length(unique(d2$UNIQID)) # 408

#dplyr::filter(y, Perfect_In_silico_specificity == FALSE)  # 8 amplicons
#dplyr::filter(y, At_least_1_specific_in_sil_prod == FALSE) # 2 amplicons

# Let's flag amplicons for removal from the training model
d=data.frame(x1=c(-Inf, -Inf), x2=c(+Inf, -15), y1=c(-Inf,-15), y2=c(-15, +Inf))
#
ggplot()+
  geom_rect(data=d, mapping=aes(xmin=x1, xmax=x2, ymin=y1, ymax=y2), color = NA, alpha = 0.3)+
  geom_point(data = dplyr::filter(y, Perfect_In_silico_specificity == TRUE), size = 1, alpha = 0.5, 
             aes(x = log2(DNA_prop_mean), y = log2(RNA_prop_mean), color = Perfect_In_silico_specificity))+
  geom_point(data = dplyr::filter(y, Perfect_In_silico_specificity == FALSE), size = 1, alpha = 1,  
             aes(x = log2(DNA_prop_mean), y = log2(RNA_prop_mean), color = Perfect_In_silico_specificity))+
  
  labs(color = "Perfect in-silico PCR specificity", 
       title = "In-islico PCR prediction of multiple PCR products with less than 1500 bp")+
  scale_color_manual(values = c("red", "black")) + 
  theme_bw() + 
  theme(text = element_text(size = 8), aspect.ratio = 1, plot.title = element_text(hjust = 0.5))


# In silico PCR correctly predicts failed products.
ggplot()+
  geom_rect(data=d, mapping=aes(xmin=x1, xmax=x2, ymin=y1, ymax=y2), color=NA, alpha=0.3)+ 
  geom_point(data = dplyr::filter(y, At_least_1_specific_in_sil_prod == TRUE), size = 1, alpha = 0.5, 
             aes(x = log2(DNA_prop_mean), y = log2(RNA_prop_mean), color = At_least_1_specific_in_sil_prod))+
  geom_point(data = dplyr::filter(y, At_least_1_specific_in_sil_prod == FALSE), size = 1, alpha = 0.5, 
             aes(x = log2(DNA_prop_mean), y = log2(RNA_prop_mean), color = At_least_1_specific_in_sil_prod))+
  scale_color_manual(values = c("red", "black")) + 
  
  labs(color = "In-silico PCR indicating at least 1 product", 
       title = "In-islico PCR prediction of any PCR products with less than 1500 bp")+
  theme_bw()+ 
  theme(text = element_text(size = 8), aspect.ratio = 1, plot.title = element_text(hjust = 0.5))
In-silico PCR prodiction of product specificity indicates high accuracy of the initial design. MPRA amplicons with predicted byproducts are not biasing the analysis. Gray area indicates amplicons with low representation removed from MPRA activity modeling.In-silico PCR prodiction of product specificity indicates high accuracy of the initial design. MPRA amplicons with predicted byproducts are not biasing the analysis. Gray area indicates amplicons with low representation removed from MPRA activity modeling.

In-silico PCR prodiction of product specificity indicates high accuracy of the initial design. MPRA amplicons with predicted byproducts are not biasing the analysis. Gray area indicates amplicons with low representation removed from MPRA activity modeling.

# Merging isPCR analysis with a bed file

bed$UNIQID <- gsub("Control_.*", "Controls", bed$Amp_name)

df_hg38 <- merge(bed, df_in_silico, by = "UNIQID", all.y = T)

# df_hg38$ChrSub <- df_hg38$Chr_GRCh38_pred - df_hg38$Chr
df_hg38$StartSub <- df_hg38$Amplicon_Start_GRCh38 - df_hg38$Start
df_hg38$EndSub <- df_hg38$Amplicon_End_GRCh38 - df_hg38$End
df_hg38_trunc <- df_hg38[,c("Amp_name", "Chr", "Start", "End", 
                            "Amplicon.Length", "Chr_GRCh38_pred", 
                            "Amplicon_Start_GRCh38", "Amplicon_End_GRCh38", 
                            "Amplicon_length_GRCh38",
                            "StartSub", "EndSub")]

# ISP_FLAG - Matching Start and Stop in bed and isPCR.
# This marks unspecific amplicons by identifying those with Start and End coordinates not matching after  merging a bed file with df_in_silico.
# ISP_FLAG == FALSE - isPCR specific
# ISP_FLAG == TRUE - isPCR unspecific
# sum(na.omit(df_hg38_trunc$ISP_FLAG == FALSE))  # 402 specific amplicons
# sum(na.omit(df_hg38_trunc$ISP_FLAG == TRUE)) # 76 unspecific

df_hg38_trunc$ISP_FLAG <- ifelse(df_hg38_trunc$StartSub == 0 & df_hg38_trunc$EndSub == 0, FALSE, TRUE)


# Add calculated GC values. This is to double check if Yurong's GC content calculation is correct. 
# Yes, it is pretty much spot on.
GC_intended_products <- dplyr::filter(d2, Intended_interval == TRUE)[,c(1, 19)]

y_w_2_GC_values <- merge(y, GC_intended_products, by.x = "Amp_name2", by.y = "UNIQID")

# GC and GC2 content is pretty much the same. 
#ggplot(y_w_2_GC_values, aes(x = GC, y =  GC2))+
#  geom_point()+
#  geom_abline(intercept = 0, color = "red", linetype = 2)+
#  theme_bw()

lm_GC_Y_vs_C <- lm(GC2 ~ GC, data = y_w_2_GC_values)

y_w_2_GC_values$GC_Y_vs_C_Residuals <- as.numeric(lm_GC_Y_vs_C$residuals)


# head(arrange(y_w_2_GC_values, desc(GC_Y_vs_C_Residuals)))

7. Linear model

  • Build a linear model (+/- GC) on amplicons that pass the following criteria:
    • RNA proportions > 2^-15
    • DNA proportions > 2^-15
    • non-specific in silico PCR removed
    • top/bottom 10% data removed
# excluded samples (L4-35 and R1)
data.all.incl <- merge(amp.prop.plot, amp.act, by.x = "Amp_name", by.y = "amp_id") # 408 x 32
data.all.incl <- merge(data.all.incl, GC_content, by.x = "Amp_name", by.y = "Amplicon") # 408 x 33

data.all <- y ## y is from's Karol's analysis
data.all$Amp_name <- as.character(data.all$Amp_name)
data.all$Amp_name2 <- as.character(data.all$Amp_name2)
rownames(data.all) <- data.all$Amp_name

## Linda: What is MeanRatio vs MeanRatio_MoM???
## A: MeanRatio = rowMeans(L1-L4 ratiometric activity)
## A: MeanRatio_MoM = rowMeans(amp.prop.plot[,5:8]) / rowMeans(amp.prop.plot[,1:4])

## KC: lm is based on MoM
## Linda: Just to make sure, remake MeanRatio_MoM and MeanRatio with more explicit names
data.all$MeanRatio_MoM <- data.all$RNA_prop_mean / data.all$DNA_prop_mean
data.all$MeanRatio <- rowMeans(data.all[,c("L1_Activity", "L2_Activity", "L3_Activity", "L4_Activity")])
data.all$MeanRatio_sd <- rowSds(data.all[,c("L1_Activity", "L2_Activity", "L3_Activity", "L4_Activity")])
data.all$MSD <- data.all$MeanRatio - data.all$MeanRatio_sd


## Change names to figure groups: 
## PutEnh (Control)
## FBDHS (NCASD)
## GWAS (SZ108, Epilepsy)
## LD (SCN1A, CACNA1C)

data.all$Amp_name <- gsub("NCASD", "FBDHS", data.all$Amp_name)
data.all$Amp_name2 <- gsub("NCASD", "FBDHS", data.all$Amp_name2)
data.all$Color2 <- gsub("NCASD", "FBDHS", data.all$Color2)
data.all$Amp_name <- gsub("Epilepsy", "EPIL", data.all$Amp_name)
data.all$Amp_name2 <- gsub("Epilepsy", "EPIL", data.all$Amp_name2)
data.all$Color2 <- gsub("Epilepsy", "EPIL", data.all$Color2)
data.all$Amp_name <- gsub("Control", "PutEnh", data.all$Amp_name)
data.all$Amp_name2 <- gsub("Controls", "PutEnh", data.all$Amp_name2)
data.all$Color2 <- gsub("Control", "PutEnh", data.all$Color2)
data.all$Amp_name <- gsub("SZ108", "SCZ", data.all$Amp_name)
data.all$Amp_name2 <- gsub("SZ108", "SCZ", data.all$Amp_name2)
data.all$Color2 <- gsub("SZ108", "SCZ", data.all$Color2)

data.all$Amp_Full_Name <- data.all$Amp_name
data.all$Amp_name <- gsub("EPIL|SCZ", "GWAS", data.all$Amp_name)
data.all$Amp_name2 <- gsub("EPIL|SCZ", "GWAS", data.all$Amp_name2)
data.all$Color2 <- gsub("EPIL|SCZ", "GWAS", data.all$Color2)
data.all$Amp_name <- gsub("SCN1A|CACNA1C", "LD", data.all$Amp_name)
data.all$Amp_name2 <- gsub("SCN1A|CACNA1C", "LD", data.all$Amp_name2)
data.all$Color2 <- gsub("SCN1A|CACNA1C", "LD", data.all$Color2)

## change rownames to match Amp_name
rownames(data.all) <- data.all$Amp_name

## set order of groups for plotting
data.all$Color2 <- factor(data.all$Color2, levels = c("GWAS", "LD", "FBDHS", "PutEnh"))

## remove previous lm columns
data.all$fit <- NULL
data.all$lwr <- NULL
data.all$upr <- NULL
data.all$Residuals <- NULL
data.all$Residuals_Z_scaled_to_lm <- NULL
data.all$Pvalue <- NULL

## dna threshold
min.dna.count <- 200
data.all$Pass_DNA_count <- rowSums(data.all[,c(7:10)] > min_count) >= 4 # F: 86; T: 322 
data.all$Pass_MeanRatio_SD <- data.all$MSD > 0 # means only pass amplicons where mean > 1 s.d., F: 122; T: 286 

# KC: MSD is a metric identifying amplicons which mean DNA ratio is greater than 0. Identifies amplicons which mean DNA ration is unreliable.


## REMOVE 265_ARID1B because effectively duplicate of 252_ARID1B
data.all <- data.all[data.all$Amp_Full_Name != "265_PutEnh_Arid1b_3", ]


## DNA + RNA prop threshold, PASS  = 323 
data.all$Pass_prop_DNA_and_RNA <- ifelse(data.all$RNA_prop_mean > 2^-15 & 
                                         data.all$DNA_prop_mean > 2^-15, TRUE, FALSE) 

## DNA prop + RNA prop + DNA count threshols. PASS = 308
data.all$Passed_QC <- ifelse(data.all$Pass_DNA_count == TRUE & 
                             data.all$Pass_prop_DNA_and_RNA == TRUE, T, F)

## DNA prop + RNA prop + DNA count threshols + Positive MSD. PASS = 219
data.all$Passed_QC_with_SD <- ifelse(data.all$Pass_DNA_count == TRUE & 
                                     data.all$Pass_prop_DNA_and_RNA == TRUE & 
                                     data.all$Pass_MeanRatio_SD == TRUE, TRUE, FALSE)



## subset to data passing both DNA count and proportion thresholds 
data.clean <- dplyr::filter(data.all, Passed_QC == TRUE & At_least_1_specific_in_sil_prod == TRUE) # 308 amplicons

## 

data.clean$Act_Rank_Clean <- rank(data.clean$MeanRatio_MoM) 


## find the top 10% and bottom% amplicons by mean activity ranking
top10 <- top_frac(data.clean, n = 0.1, wt = Act_Rank_Clean)
bottom10 <- top_frac(data.clean, n = -0.1, wt = Act_Rank_Clean)
remove.from.training <- c(as.character(top10$Amp_name), as.character(bottom10$Amp_name))
data.clean$training <- ifelse(data.clean$Amp_name %in% remove.from.training, FALSE, TRUE)

### Training dataset ###
ggplot(data.clean, aes(x = log2(DNA_prop_mean), y = log2(RNA_prop_mean), color = training))+
  geom_point()+
  theme_bw()+
  scale_color_manual(values = c("red", "grey"))

## make data for lm  
data.for.lm <- data.clean # 308
data.for.lm <- dplyr::filter(data.for.lm, training == TRUE) # 248 amplicons used for training lm 

7.1 GC content in lm

GC content is a valid covariate of the lm, significantly improving its performance.

## do lm with and without gc as variable on mean prop data
mod_training_with_GC <- lm(log2(RNA_prop_mean) ~ log2(DNA_prop_mean) + GC, data = data.for.lm)
mod_training_without_GC <- lm(log2(RNA_prop_mean) ~ log2(DNA_prop_mean), data = data.for.lm)

7.1.1 lm model without GC

summary(mod_training_without_GC)
## 
## Call:
## lm(formula = log2(RNA_prop_mean) ~ log2(DNA_prop_mean), data = data.for.lm)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.89246 -0.54850  0.06002  0.61099  1.62332 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -0.78093    0.33900  -2.304   0.0221 *  
## log2(DNA_prop_mean)  0.92453    0.03837  24.094   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8307 on 246 degrees of freedom
## Multiple R-squared:  0.7024, Adjusted R-squared:  0.7012 
## F-statistic: 580.5 on 1 and 246 DF,  p-value: < 2.2e-16
par(mfrow = c(2, 2))
plot(mod_training_without_GC)

7.1.2 lm model with GC

summary(mod_training_with_GC)
## 
## Call:
## lm(formula = log2(RNA_prop_mean) ~ log2(DNA_prop_mean) + GC, 
##     data = data.for.lm)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.88097 -0.48373 -0.00093  0.51671  1.81914 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          1.93109    0.56097   3.442 0.000678 ***
## log2(DNA_prop_mean)  1.01904    0.03944  25.838  < 2e-16 ***
## GC                  -4.52937    0.77173  -5.869 1.42e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7794 on 245 degrees of freedom
## Multiple R-squared:  0.7391, Adjusted R-squared:  0.7369 
## F-statistic: 346.9 on 2 and 245 DF,  p-value: < 2.2e-16
par(mfrow = c(2, 2))
plot(mod_training_with_GC)

BIC without GC covariate

BIC(mod_training_without_GC)
## [1] 626.2976

BIC with GC covariate. Lower value justifies GC addition to the model.

BIC(mod_training_with_GC)
## [1] 599.1863
## use lm to predict residuals of dataset for prediction (data.predict)
# consider data.clean with 308 amplicons 

# data.predict <- dplyr::filter(data.all, Pass_DNA_count == TRUE) # 321 amplicons

# NOTE: This edit has not been approved yet:
data.predict <- data.clean  # KC: replaced by data.clean to limit the confusion from showing amplicons with reduced activity that do not pass stringent QC.

predict_data.predict <- predict(mod_training_with_GC, newdata = data.predict, interval = "confidence")
predict_data.predict <- as.data.frame(predict_data.predict)

## append both lms to data.predict
data.predict <- data.frame(data.predict, predict_data.predict)
data.predict$Residuals_Manual <- log2(data.predict$RNA_prop_mean) - data.predict$fit


## calculate residual z-scores (res - mean)/sd
## z-scores are scaled to training distribution used for fitting  

# I'm scaling the Z scores to the training distribution, the one used for fitting the model
lm_sd <- sd(mod_training_with_GC$residuals)
lm_mean <- mean(mod_training_with_GC$residuals)

data.predict$Residuals_Z_scaled_to_lm <- (data.predict$Residuals_Manual - lm_mean) / lm_sd

# Calculate normal p values
data.predict$Pvalue <- pnorm(data.predict$Residuals_Z_scaled_to_lm, lower.tail = FALSE)
#data.predict$FDR <- p.adjust(data.predict$Pvalue, method = "BH")
# saving data.predict as a supp table

data.predict_supp <- data.predict

# Cleaning up the data
rownames(data.predict_supp) <- NULL
data.predict_supp$Color <- NULL
colnames(data.predict_supp)[5] <- "Amp_group"
data.predict_supp$Pass_DNA_count <- NULL
data.predict_supp$Sign <- NULL
data.predict_supp$GC_above_mean <- NULL
data.predict_supp$Perfect_In_silico_specificity <- NULL
data.predict_supp$Amp_name2 <- NULL
data.predict_supp$At_least_1_specific_in_sil_prod <- NULL
data.predict_supp$Amp_Full_Name <- NULL
data.predict_supp$Pass_MeanRatio_SD <- NULL
data.predict_supp$Pass_prop_DNA_and_RNA <- NULL
data.predict_supp$Passed_QC <- NULL
data.predict_supp$Passed_QC_with_SD <- NULL
colnames(data.predict_supp)[43] <-  "Act_Rank"
colnames(data.predict_supp)[45] <- "lm_predicted_log2_RNA_prop"
colnames(data.predict_supp)[46] <- "lwr_95_conf_int"
colnames(data.predict_supp)[47] <- "upr_95_conf_int"


head(data.predict_supp)
colnames(data.predict_supp)

#write.csv(data.predict_supp, file = "G:/Shared drives/Nord Lab - Temp #Overflow/STARR-408_Lambert&Su-Feher/biorxiv submission/20210412/Supp_Tables/lm_308_data.csv")

8. Wilcoxon rank sum test

Wilcoxon rank sum test was considered as an alternative and non-parametric method for testing MPRA activity

8.1 Activity heatmaps

8.1.1 Rank heatmap

### Actiivty rank pheatmap
library(pheatmap)

df_act <- data.frame(
  "L1_rank" = rank(-data.predict$L1_Activity),
  "L2_rank" = rank(-data.predict$L2_Activity),
  "L3_rank" = rank(-data.predict$L3_Activity),
  "L4_rank" = rank(-data.predict$L4_Activity)
  )

# Ordering rows by residuals - not used for anything 
df_act$Amp_name <- data.predict$Amp_name
df_act$Mean_act <- data.predict$Residuals_Z_scaled_to_lm
df_act <- arrange(df_act, Mean_act)

rownames(df_act) <- df_act$Amp_name

df_act$Amp_name <- NULL
df_act$Mean_act <- NULL

pheatmap(df_act, cluster_rows = T, cluster_cols = F, show_rownames = F, main = "Ratiometric activity ranks. High rank = High activity")
Heatmap of ratiometric activity ranks. Low rank indicate high activity

Heatmap of ratiometric activity ranks. Low rank indicate high activity

8.1.2 log2 ratiometric activity heatmap

### Activity pheatmap
df_act <- data.frame(
  "L1_log2_act" = log2(data.predict$L1_Activity),
  "L2_log2_act" = log2(data.predict$L2_Activity),
  "L3_log2_act" = log2(data.predict$L3_Activity),
  "L4_log2_act" = log2(data.predict$L4_Activity)
  )

pheatmap(df_act, cluster_rows = T, show_rownames = F, cluster_cols = F, main = "log2 ratiometric activity")
Heatmap of log2 ratiometric activity values.

Heatmap of log2 ratiometric activity values.

8.1.3 log2 proportions heatmap

### Proportions heatmap

pheatmap(log2(data.predict[,20:27]), cluster_cols =  FALSE, show_rownames = FALSE, main = "log2 proportions")
Heatmap of log2 DNA and log2 RNA proportions

Heatmap of log2 DNA and log2 RNA proportions

8.2 Wilcoxon rank sum test

# AKA Mann–Whitney U test, Mann–Whitney–Wilcoxon (MWW), Wilcoxon–Mann–Whitney test

# Wilcoxon rank sum test aka. Mann–Whitney–Wilcoxon (MWW),
df <- data.predict[, c("L1_DNA_prop", "L2_DNA_prop", "L3_DNA_prop", "L4_DNA_prop", 
                       "L1_RNA_prop", "L2_RNA_prop", "L3_RNA_prop", "L4_RNA_prop")]


wt_results <- rbindlist(lapply(1:nrow(df), function(x) FUN = {

  wt <- wilcox.test(y = as.numeric(df[x,1:4]), x = as.numeric(df[x,5:8]), 
            paired = FALSE, 
            exact = TRUE,
            correct = TRUE,
            conf.int = TRUE, 
            
            alternative = "greater")

  wt_data <- data.frame("P_MWW" = wt$p.value, "conf_low" = wt$conf.int[1], "conf_high"= wt$conf.int[2])

}
))  

# table(wt_results$p)

 data.predict_MWW <- cbind(data.predict, wt_results)
 data.predict_MWW$MWW_sig <- ifelse(data.predict_MWW$P_MWW < 0.05, "Wilcoxon Rank Sum P < 0.05", "Non-significant")
 
 data.predict_MWW$FDR <- p.adjust(data.predict_MWW$P_MWW, method = "BH")
 
 #sum(p.adjust(data.predict_MWW$P_MWW, method = "BH") < 0.1) # 0
 #sum(p.adjust(data.predict_MWW$P_MWW, method = "BH") < 0.11) # 43
 #sum(p.adjust(data.predict_MWW$P_MWW, method = "BH") < 0.2) # 48
 
 data.predict_MWW$MWW_sig <- ifelse(data.predict_MWW$P_MWW < 0.05, 
                                    "MWW P < 0.05", "Non-significant")
 
 data.predict_MWW$MWW_sig <- ifelse(data.predict_MWW$FDR < 0.2, 
                                    "MWW FDR < 0.2", data.predict_MWW$MWW_sig)
 
 data.predict_MWW$MWW_sig <- ifelse(data.predict_MWW$FDR < 0.11, 
                                    "MWW FDR < 0.11", data.predict_MWW$MWW_sig)
 
####
formula <- y ~ x

p1 <- ggplot(data.predict_MWW, aes(x = log2(DNA_prop_mean), y = log2(RNA_prop_mean)))+
  geom_point(alpha = 0.5, aes(color = MWW_sig))+
  scale_color_manual(values = c("red", "darkgreen", "grey")) + 
  labs(title = "Mean log2 DNA vs RNA prop.", 
       x = bquote(log[2](proportion[DNA])), 
       y = bquote(log[2](proportion[RNA])))+
  geom_smooth(formula = formula, method = 'lm', color = "black")+
  #stat_poly_eq(formula = formula, parse = TRUE, size = 4)+
  coord_fixed()+ 
  theme_bw()+ 
  theme(text = element_text(size = 8), 
        plot.title = element_text(size = 8))+ 
  theme(plot.title = element_text(hjust = 0.5), 
        legend.title = element_blank(),
        legend.margin = margin(0,0,0,-10))


p1
Wilcoxon rank sum test, one-sided, unpaired.

Wilcoxon rank sum test, one-sided, unpaired.

#ggsave("G:/Shared drives/Nord Lab - Temp Overflow/STARR-408_Lambert&Su-Feher/Draft #Figures/Figures_eLIFE_KC/MWW_P.svg", plot = p1, dpi = 300, width = 4, height = 7)

#ggsave("G:/Shared drives/Nord Lab - Temp Overflow/STARR-408_Lambert&Su-Feher/Draft #Figures/Figures_eLIFE_KC/MWW_P.png", plot = p1, dpi = 300, width = 4, height = 7)

8.3 Wilcoxon rank sum vs lm

# Venn diagram
library(Vennerable)
# 
P_norm_amp <- dplyr::filter(data.predict_MWW, Pvalue < 0.05)$Amp_name
P_MWW_amp <- dplyr::filter(data.predict_MWW, P_MWW < 0.05)$Amp_name

ll <- list(P_norm_amp, P_MWW_amp)
llv <- Venn(ll, SetNames = c("lm P < 0.05 amp.", "MWW P < 0.05 amp."))

plot(llv, doWeights = TRUE, type = "circles")
lm and Wilcoxon rank sum test demonstrate substantial intersection in amplicons passing P < 0.05

lm and Wilcoxon rank sum test demonstrate substantial intersection in amplicons passing P < 0.05

#write.csv(y, file = "G:/Shared drives/Nord Lab - Computational Projects/MPRA/STAR408/data_complete_408.csv")

#write.csv(data.predict, file = "G:/Shared drives/Nord Lab - Computational Projects/MPRA/STAR408/data_predict_308.csv")
# Move down
setdiff(
  dplyr::filter(data.predict, Pvalue < 0.05)$Amp_name, 
  dplyr::filter(data.predict, MeanRatio > 1.5, MSD > 0)$Amp_name
  )


setdiff(
  dplyr::filter(data.predict, MeanRatio > 1.5, MSD > 0)$Amp_name,
  dplyr::filter(data.predict, Pvalue < 0.05)$Amp_name
  )

amp_lm_model <- dplyr::filter(data.predict, Pvalue < 0.05)$Amp_name
amp_ratio_model <- dplyr::filter(data.predict, MeanRatio > 1.5, MSD > 0)$Amp_name

amp_lm_model # 41 amplicons
amp_ratio_model # 71 amplicons

setdiff(amp_lm_model, amp_ratio_model) # 9 different amplicons

(41 - 9) / 41

32/41

#############

All amplicons identified using model residuals were also active in the ratiometric comparison.

9. MPRA summary table

# Coordinates are in GRCh38

datatable(
  arrange(data.predict, 
          desc(Residuals_Z_scaled_to_lm))[,c("Amp_name", "Chr", "Start", "End", "MeanRatio", "MeanRatio_sd", "training", "Residuals_Z_scaled_to_lm", "Pvalue")], 
  rownames = FALSE)

10. Manuscript figures

10.1 Fig. 1B

#

## FIGURE 1C with data.predict

# This is effectively replacing data.predict with data.clean)
fig.1c <- data.predict

# Significance annotation and ordering
fig.1c$Sig <- ifelse(fig.1c$Pvalue >= 0.05, "Non-Significant", "P < 0.05,  Res. > 0")
fig.1c$Sig <- factor(fig.1c$Sig, levels = c("P < 0.05,  Res. > 0", "Non-Significant"))


p2 <- ggplot()+ 
 geom_point(data = dplyr::filter(fig.1c, Pvalue > 0.05), 
               aes(x = log2(DNA_prop_mean), y = log2(RNA_prop_mean), color = Sig), 
               alpha = 0.7, size = 1, shape = 16)+ 
 geom_point(data = dplyr::filter(fig.1c, Pvalue < 0.05), 
            aes(x = log2(DNA_prop_mean), y = log2(RNA_prop_mean), color = Sig), 
            alpha = 0.7, size = 1, shape = 16)+

 scale_color_manual(values = c("Non-Significant" = "grey",
                               "P < 0.05,  Res. > 0" = "red"))+    
    
  labs(title = "Mean Amplicon Correlation", color = NULL,
       x = bquote(log[2]~mean(proportion[DNA])), 
       y = bquote(log[2]~mean(proportion[RNA]))) +
  lims(x = c(-15,-5), y = c(-15, -5)) +
  coord_fixed() + 
  guides(color = guide_legend(ncol = 1, override.aes = list(size = 2))) + 
  theme_bw() + theme(text = element_text(size = 8), 
                     plot.title = element_text(size = 8, hjust = 0.5), 
                     axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
                     legend.position = "right")+
  theme(panel.grid.minor = element_blank())

 p2    
Fig.1B Correlation of mean RNA/DNA ratio in the assay showing amplicons tested by the multiple linear model (N = 308). Amplicons with significantly (P < 0.05) increased model residual value (Res.) in RNA compared to DNA are shown in red.

Fig.1B Correlation of mean RNA/DNA ratio in the assay showing amplicons tested by the multiple linear model (N = 308). Amplicons with significantly (P < 0.05) increased model residual value (Res.) in RNA compared to DNA are shown in red.

#ggsave("G:/Shared drives/Nord Lab - Temp Overflow/STARR-408_Lambert&Su-Feher/Draft Figures/Figures_eLIFE_KC//FIG1D_CleanedData_by_P_and_Residual_KCv2_3_30_2021.png", plot = p2, dpi = 300, width = 3*1.5, height = 5*1.5)

#ggsave("G:/Shared drives/Nord Lab - Temp Overflow/STARR-408_Lambert&Su-Feher/Draft Figures/Figures_eLIFE_KC/FIG1D_CleanedData_by_P_and_Residual_KCv2_3_30_2021.svg", plot = p2, dpi = 300, width = 3*1.5, height = 5*1.5)

 #

#library(gridExtra)

#grid.arrange(p1+theme(legend.position = "bottom"), 
#             p2+theme(legend.position = "bottom"), 
#             nrow = 1)

10.2 Fig. S2C

# Left

p1 <- ggplot() + 
  geom_point(data = data.all[! data.all$Amp_name %in% 
                               data.predict$Amp_name[data.predict$Pvalue < 0.05],], 
             
             aes(x = log2(DNA_prop_mean), y = log2(RNA_prop_mean), color = "Non-Significant"), 
             alpha = 0.3, size = 1, shape = 16)+
  
  geom_point(data = data.all[data.all$Amp_name %in% 
                               data.predict$Amp_name[data.predict$Pvalue < 0.05],], 
             
             aes(x = log2(DNA_prop_mean), y = log2(RNA_prop_mean), color = "Significant"),
             alpha = 0.5, size = 1, shape = 16)+
  
  scale_color_manual(values = c("Significant" = "red", 
                                "Non-Significant" = "grey50")) + 
  labs(title = "Mean Amplicon Correlation", color = NULL,
       x = bquote(log[2]~mean(proportion[DNA])), 
       y = bquote(log[2]~mean(proportion[RNA]))) +
  geom_smooth(data = data.for.lm, aes(x = log2(DNA_prop_mean), y = log2(RNA_prop_mean)), 
              formula = y ~ x, method = 'lm', se = F, color = "black", size = 0.5, linetype = 2, fullrange = T)+
  # lims(x = c(-15,-5), y = c(-20, -5)) + 
  coord_fixed() + 
  guides(color = guide_legend(ncol = 1, override.aes = list(size = 2))) + 
  theme_bw() + theme(text = element_text(size = 8), 
                     plot.title = element_text(size = 8, hjust = 0.5), 
                     axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
                     legend.position = "right")+
   theme(panel.grid.minor = element_blank())


p1
Fig.S2C Correlation of mean RNA/DNA ratio in the assay as shown in Figure 1B, here showing data for all target amplicons (n = 408). Dashed line represents RNA/DNA best-fit line of the data (n = 248). Amplicons are colored by whether they were found significant (P < 0.05) using the multiple linear model.

Fig.S2C Correlation of mean RNA/DNA ratio in the assay as shown in Figure 1B, here showing data for all target amplicons (n = 408). Dashed line represents RNA/DNA best-fit line of the data (n = 248). Amplicons are colored by whether they were found significant (P < 0.05) using the multiple linear model.

#ggsave("G:/Shared drives/Nord Lab - Temp Overflow/STARR-408_Lambert&Su-Feher/Draft Figures/Figures_eLIFE_KC/FIG1D_AllData_with_lm_KC_3_30_2021.svg", plot = p1, dpi = 300, width = 3, height = 5)

#ggsave("G:/Shared drives/Nord Lab - Temp Overflow/STARR-408_Lambert&Su-Feher/Draft Figures/Figures_eLIFE_KC/FIG1D_AllData_with_lm_KC_3_30_2021.png", plot = p1, dpi = 300, width = 3, height = 5)

10.3 Fig. S3A-D

library(gridExtra)

eq_text_size = 1.7

S3A <- ggplot(data.predict, aes(x = log2(Maxi_prop), y = GC))+
  geom_point(size = 0.5, alpha = 0.5)+
  geom_smooth(formula = formula, method = 'lm', color = "black")+ 
  stat_poly_eq(formula = formula, label.y = "top", label.x = "left",
               aes(label = paste(..rr.label.., ..eq.label.., sep = "~~~")), 
               parse = TRUE, size = eq_text_size)+
  scale_color_gradient2(midpoint = mean(data.predict$GC), low="blue3", mid="white",
                        high="red3", space ="Lab" )+
  labs(title = "Maxiprep vs GC content", 
       x = bquote(log[2](prop[Maxiprep])), 
       y = "Amplicon GC Content")+ 
  theme_bw()+ 
  theme(text = element_text(size = 8), 
        plot.title = element_text(size = 7, hjust = 0.5), aspect.ratio = 1) 


S3B <- ggplot(data.predict[order(data.predict$GC, decreasing = F),], 
              aes(x = log2(DNA_prop_mean), y = GC))+
  geom_point(size = 0.5, alpha = 0.5)+
  geom_smooth(formula = formula, method = 'lm', color = "black")+
  #stat_poly_eq(formula = formula, parse = TRUE, eq.with.lhs = TRUE)+
  stat_poly_eq(formula = formula, 
               aes(label = paste(..rr.label.., ..eq.label.., sep = "~~~")), 
               parse = TRUE, size = eq_text_size)+ 
  labs(title = "DNA proportion vs GC content", 
       x = bquote(log[2](mean(prop[DNA]))), 
       y = "Amplicon GC Content") + 
  theme_bw()+ 
  theme(text = element_text(size = 8), 
        plot.title = element_text(size = 7, hjust = 0.5), aspect.ratio = 1)


S3C <- ggplot(data.predict, aes(x = log2(RNA_prop_mean), y = GC))+
  geom_point(size = 0.5, alpha = 0.5) +
  geom_smooth(formula = formula, method = 'lm', color = "black")+
  #stat_poly_eq(formula = formula, parse = TRUE, eq.with.lhs = TRUE)+
  stat_poly_eq(formula = formula, 
               aes(label = paste(..rr.label.., ..eq.label.., sep = "~~~")), 
               parse = TRUE, size = eq_text_size)+ 
  labs(title = "RNA proportion vs GC content", 
       x = bquote(log[2](mean(prop[RNA]))), 
       y = "Amplicon GC Content") + 
  theme_bw()+ 
  theme(text = element_text(size = 8), 
        plot.title = element_text(size = 7, hjust = 0.5), aspect.ratio = 1)


S3D <- ggplot(data.predict, aes(x = log2(MeanRatio_MoM), y = GC))+
  geom_point(size = 0.5, alpha = 0.5) +
  geom_smooth(formula = formula, method = 'lm', color = "black")+
  #stat_poly_eq(formula = formula, parse = TRUE, eq.with.lhs = TRUE)+
  stat_poly_eq(formula = formula, 
               aes(label = paste(..rr.label.., ..eq.label.., sep = "~~~")), 
               parse = TRUE, size = eq_text_size)+ 
  labs(title = "Mean Ratio vs GC content", 
       x = bquote(log[2](mean(Activity))), 
       y = "Amplicon GC Content")+ 
  theme_bw()+ 
  theme(text = element_text(size = 8), 
        plot.title = element_text(size = 7, hjust = 0.5), aspect.ratio = 1)


Fig_S3 <- grid.arrange(S3A, S3B, S3C, S3D) 
Supplemental Figure 3. Amplicon selection for linear model building. (A) GC content per amplicon by previral library maxiprep proportion. (B) GC content per amplicon by mean DNA proportion. (C) GC content per amplicon by mean RNA proportion. (D) GC content per amplicon by mean RNA/DNA ratio. Line represents linear best-fit line for all plots. Shaded area represents standard error. GC content is correlated with the previral library and the DNA samples, but not to RNA or ratiometric activity.

Supplemental Figure 3. Amplicon selection for linear model building. (A) GC content per amplicon by previral library maxiprep proportion. (B) GC content per amplicon by mean DNA proportion. (C) GC content per amplicon by mean RNA proportion. (D) GC content per amplicon by mean RNA/DNA ratio. Line represents linear best-fit line for all plots. Shaded area represents standard error. GC content is correlated with the previral library and the DNA samples, but not to RNA or ratiometric activity.

#ggsave("G:/Shared drives/Nord Lab - Temp Overflow/STARR-408_Lambert&Su-Feher/Draft #Figures/Figures_eLIFE_KC/Fig2S.png", plot = Fig_S2, device = "png", 
#       dpi = 300, width = 3.5, height = 3.5)


#ggsave("G:/Shared drives/Nord Lab - Temp Overflow/STARR-408_Lambert&Su-Feher/Draft #Figures/Figures_eLIFE_KC/Fig2S.svg", plot = Fig_S2, device = "svg", 
#       dpi = 300, width = 3.5, height = 3.5)

10.4 Fig. S3EFG+ extra panels

#  fig.cap = "These two panels didn't make it into the paper. (left) Amplicons excluded from the dataset for having less than 200 DNA or RNA counts are colored in red. Grey area represents amplicons with low proportions (< 2-10). (right) Distribution of standardized residuals is largely normally distributed. Blue distribution represents linear model training set (n = 248, p = 0.2, Kolmogorov-Smirnov test). Red distribution represents modeled data set (n = 308 p = 0.2, Kolmogorov-Smirnov test). Dashed red line represents normal distribution of mean 0 and standard deviation of 1." 

## plot amplicons flagged for removal 
d <- data.frame(x1 = c(-Inf, -Inf), x2 = c(+Inf, -15), y1 = c(-Inf,-15), y2 = c(-15, +Inf))

S3A <- ggplot()+
  geom_point(data = data.all[order(data.all$Passed_QC, decreasing = T),], 
             aes(x = log2(DNA_prop_mean), y = log2(RNA_prop_mean), color = Passed_QC), 
             size = 1, alpha = 0.5, shape = 16)+
  geom_rect(data = d, mapping = aes(xmin = x1, xmax = x2, ymin = y1, ymax = y2), 
            color = NA, alpha = 0.2) + 
  scale_color_manual(values = c("red", "black")) +
  labs(title = "Flagged for low DNA or RNA counts", color = NULL, 
       x = bquote(log[2](mean(prop[DNA]))), 
       y = bquote(log[2](mean(prop[RNA])))) +
  theme_bw()+ 
  theme(text = element_text(size = 6), aspect.ratio = 1, 
        plot.title = element_text(size = 7, hjust = 0.5), 
        axis.text = element_text(hjust = 1, vjust = 1),
        legend.position = "none", legend.margin = margin(0,0,0,-10),
        panel.grid.minor = element_blank())


S3A


S3B <- ggplot()+
  geom_point(data = data.clean, 
             aes(x = log2(DNA_prop_mean), y = log2(RNA_prop_mean), color = training), 
             size = 1, alpha = 0.5, shape = 16)+
  scale_color_manual(values = c("red", "black"))+
  labs(title = "Removed from LM Training", color = NULL, 
       x = bquote(log[2](mean(prop[DNA]))), 
       y = bquote(log[2](mean(prop[RNA])))) +
  theme_bw()+ 
  theme(text = element_text(size = 6), aspect.ratio = 1, 
        plot.title = element_text(size = 7, hjust = 0.5), 
        axis.text = element_text(hjust = 1, vjust = 1),
        legend.position = "none", legend.margin = margin(0,0,0,-10),
        panel.grid.minor = element_blank())

S3B 

# convert mod (lm) to fortify (data.frame)
mod_training_with_GC.fort <- fortify(mod_training_with_GC)

# make a normal distribution with s.d. = 1
r_norm <- rnorm(n = 100000, sd = 1)

# Training - Raw
S3C <- ggplot()+
  geom_density(data = dplyr::filter(data.predict, training == TRUE), 
               aes(x = Residuals_Manual, fill = training), fill = "blue", alpha = 0.3)+
  geom_density(data = data.predict, 
               aes(x = Residuals_Manual, fill = training), fill = "red", alpha = 0.3)+
  geom_density(data = data.frame("r_norm" = r_norm), 
               aes(x = r_norm), color = "red", linetype = 2, alpha = 0)+
  labs(x = "Residuals", y = "Density", title = "Distribution of Residuals")+ 
  theme_bw()+ 
  theme(text = element_text(size = 6), aspect.ratio = 1, 
        plot.title = element_text(size = 7, hjust = 0.5), 
        axis.text = element_text(hjust = 1, vjust = 1),
        legend.position = "none", legend.margin = margin(0,0,0,-10),
        panel.grid.minor = element_blank())


S3C

#shapiro.test(dplyr::filter(data.predict, training == TRUE)$Residuals_Manual)
#shapiro.test(data.predict$Residuals_Manual)

set.seed(1234)
#ks.test(dplyr::filter(data.predict, training == TRUE)$Residuals_Manual, rnorm(10^6))

set.seed(1234)
#ks.test(data.predict$Residuals_Manual, rnorm(10^6))



# Fig. caption = Distribution of standardized residuals is largely normally distributed. Blue, training data set (n = 248); Red, modeled data set (n = 308); Red dashed line, normal distribution of mean 0 and 1 SD.

S3D <- ggplot(data.predict, aes(x = fit, y = Residuals_Manual)) + 
  geom_point(data = data.predict[data.predict$Pvalue >= 0.05,],
             aes(color = "Non-Significant"),
             alpha = 0.3, size = 1, shape = 16)+
  geom_point(data = data.predict[data.predict$Pvalue < 0.05 & data.predict$Residuals_Manual > 0,],
             aes(color = "Significant (resd. > 0)"),
             alpha = 0.5, size = 1, shape = 16)+
 
  scale_color_manual(values = c("Significant (resd. > 0)" = "red",
                                "Non-Significant" = "grey50"))+
  labs(title = "Model Residuals by Fitted Values", color = NULL,
       x = bquote(Fitted~log[2]~(proportion[RNA])), 
       y = bquote(Residual))+
  guides(color = guide_legend(ncol = 1, override.aes = list(size = 2)))+ 
  theme_bw()+ 
  theme(text = element_text(size = 6), aspect.ratio = 1, 
        plot.title = element_text(size = 7, hjust = 0.5), 
        axis.text = element_text(hjust = 1, vjust = 1),
        legend.position = "none", legend.margin = margin(0,0,0,-10),
        panel.grid.minor = element_blank())

S3D
    
# Fig. caption = Residuals by fitted log2 mean RNA proportion. Amplicons passing significant (pnorm < 0.05) are colored.


S3E <- summary(mod_training_with_GC)

  
# Fig. caption = Linear model summary. GC content is a significant covariate (p = 1.42e-08).  

#Fig_S3 <- grid.arrange(S3A, S3B, S3C, S3D) 

#ggsave("G:/Shared drives/Nord Lab - Temp Overflow/STARR-408_Lambert&Su-Feher/Draft #Figures/Figures_eLIFE_KC/Fig3S.png", plot = Fig_S3, device = "png", 
#       dpi = 300, width = 3.5, height = 3.5)

#ggsave("G:/Shared drives/Nord Lab - Temp Overflow/STARR-408_Lambert&Su-Feher/Draft #Figures/Figures_eLIFE_KC/Fig3S.svg", plot = Fig_S3, device = "svg", 
#       dpi = 300, width = 5, height = 5
Fig. S3EFG+ extra panels (X) Amplicons excluded from the dataset for having less than 200 DNA or RNA counts are colored in red. Grey area represents amplicons with low proportions (< 2-10). (E) Removal of the top 10% and bottom 10% of amplicons from linear model building. (X) Distribution of standardized residuals is largely normally distributed. Blue distribution represents linear model training set (n = 248, p = 0.2, Kolmogorov-Smirnov test). Red distribution represents modeled data set (n = 308 p = 0.2, Kolmogorov-Smirnov test). Dashed red line represents normal distribution of mean 0 and standard deviation of 1. (F) Residuals by fitted log2 RNA proportion. Amplicons passing significant (pnorm < 0.05) are colored. (G) Linear model summary. GC content is considered a significant covariate (p = 1.42e-08, t-test).Fig. S3EFG+ extra panels (X) Amplicons excluded from the dataset for having less than 200 DNA or RNA counts are colored in red. Grey area represents amplicons with low proportions (< 2-10). (E) Removal of the top 10% and bottom 10% of amplicons from linear model building. (X) Distribution of standardized residuals is largely normally distributed. Blue distribution represents linear model training set (n = 248, p = 0.2, Kolmogorov-Smirnov test). Red distribution represents modeled data set (n = 308 p = 0.2, Kolmogorov-Smirnov test). Dashed red line represents normal distribution of mean 0 and standard deviation of 1. (F) Residuals by fitted log2 RNA proportion. Amplicons passing significant (pnorm < 0.05) are colored. (G) Linear model summary. GC content is considered a significant covariate (p = 1.42e-08, t-test).Fig. S3EFG+ extra panels (X) Amplicons excluded from the dataset for having less than 200 DNA or RNA counts are colored in red. Grey area represents amplicons with low proportions (< 2-10). (E) Removal of the top 10% and bottom 10% of amplicons from linear model building. (X) Distribution of standardized residuals is largely normally distributed. Blue distribution represents linear model training set (n = 248, p = 0.2, Kolmogorov-Smirnov test). Red distribution represents modeled data set (n = 308 p = 0.2, Kolmogorov-Smirnov test). Dashed red line represents normal distribution of mean 0 and standard deviation of 1. (F) Residuals by fitted log2 RNA proportion. Amplicons passing significant (pnorm < 0.05) are colored. (G) Linear model summary. GC content is considered a significant covariate (p = 1.42e-08, t-test).Fig. S3EFG+ extra panels (X) Amplicons excluded from the dataset for having less than 200 DNA or RNA counts are colored in red. Grey area represents amplicons with low proportions (< 2-10). (E) Removal of the top 10% and bottom 10% of amplicons from linear model building. (X) Distribution of standardized residuals is largely normally distributed. Blue distribution represents linear model training set (n = 248, p = 0.2, Kolmogorov-Smirnov test). Red distribution represents modeled data set (n = 308 p = 0.2, Kolmogorov-Smirnov test). Dashed red line represents normal distribution of mean 0 and standard deviation of 1. (F) Residuals by fitted log2 RNA proportion. Amplicons passing significant (pnorm < 0.05) are colored. (G) Linear model summary. GC content is considered a significant covariate (p = 1.42e-08, t-test).

Fig. S3EFG+ extra panels (X) Amplicons excluded from the dataset for having less than 200 DNA or RNA counts are colored in red. Grey area represents amplicons with low proportions (< 2-10). (E) Removal of the top 10% and bottom 10% of amplicons from linear model building. (X) Distribution of standardized residuals is largely normally distributed. Blue distribution represents linear model training set (n = 248, p = 0.2, Kolmogorov-Smirnov test). Red distribution represents modeled data set (n = 308 p = 0.2, Kolmogorov-Smirnov test). Dashed red line represents normal distribution of mean 0 and standard deviation of 1. (F) Residuals by fitted log2 RNA proportion. Amplicons passing significant (pnorm < 0.05) are colored. (G) Linear model summary. GC content is considered a significant covariate (p = 1.42e-08, t-test).

10.5 Fig. S2AB

# I must edit the Amp_names to match amp.prop.plot and data.predict

amp.prop.plot$Amp_name <- gsub("NCASD", "FBDHS", amp.prop.plot$Amp_name)
amp.prop.plot$Color2 <- gsub("NCASD", "FBDHS", amp.prop.plot$Color2)
amp.prop.plot$Amp_name <- gsub("Epilepsy", "EPIL", amp.prop.plot$Amp_name)

amp.prop.plot$Color2 <- gsub("Epilepsy", "EPIL", amp.prop.plot$Color2)
amp.prop.plot$Amp_name <- gsub("Control", "PutEnh", amp.prop.plot$Amp_name)

amp.prop.plot$Color2 <- gsub("Control", "PutEnh", amp.prop.plot$Color2)
amp.prop.plot$Amp_name <- gsub("SZ108", "SCZ", amp.prop.plot$Amp_name)

amp.prop.plot$Color2 <- gsub("SZ108", "SCZ", amp.prop.plot$Color2)

amp.prop.plot$Amp_Full_Name <- amp.prop.plot$Amp_name
amp.prop.plot$Amp_name <- gsub("EPIL|SCZ", "GWAS", amp.prop.plot$Amp_name)

amp.prop.plot$Color2 <- gsub("EPIL|SCZ", "GWAS", amp.prop.plot$Color2)
amp.prop.plot$Amp_name <- gsub("SCN1A|CACNA1C", "LD", amp.prop.plot$Amp_name)

amp.prop.plot$Color2 <- gsub("SCN1A|CACNA1C", "LD", amp.prop.plot$Color2)



# DNA correlations - capped at 2e-10
DNA_prop_cor_plot <- log2(as.matrix(dplyr::filter(amp.prop.plot, Amp_name %in% data.predict$Amp_name)[,c("L1-DNA", "L2-DNA", "L3-DNA", "L4-DNA", "Maxiprep")]))

DNA_prop_cor_plot <- as.data.frame(ifelse(as.matrix(DNA_prop_cor_plot) < -10, -10, as.matrix(DNA_prop_cor_plot)))

#DNA_prop_cor_plot <- as.data.frame(DNA_prop_cor_plot)

p <- ggpairs(DNA_prop_cor_plot , 
        progress = FALSE,
        lower = list(continuous = wrap("points", size = 0.5, alpha = 0.5, pch = 16), 
                     combo = "box", discrete = "blank", na = "na"),
        diag = list(discrete = "blankDiag", continuous = wrap("densityDiag", alpha = 0.4), 
                    na = "blankDiag"), 
        upper = list(continuous = wrap('cor', size = 4)))+
  
        #labs(title = bquote(gDNA~and~Maxiprep~by~log[2](Proportion))) + 
        theme_bw()+ 
        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), 
              text = element_text(size = 10), 
        axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

p


#ggsave("G:/Shared drives/Nord Lab - Temp Overflow/STARR-408_Lambert&Su-Feher/Draft #Figures/Figures_eLIFE_KC/FIG1B_DNAProportionCorrelation_capped.png", 
#       plot = p, dpi = 300, width = 6, height = 6)

#ggsave("G:/Shared drives/Nord Lab - Temp Overflow/STARR-408_Lambert&Su-Feher/Draft #Figures/Figures_eLIFE_KC/FIG1B_DNAProportionCorrelation_capped.svg", 
#       plot = p, dpi = 300, width = 6, height = 6)


# Prop < 2e-10 removed
DNA_prop_cor_plot <- log2(as.matrix(dplyr::filter(amp.prop.plot, Amp_name %in% data.predict$Amp_name)[,c("L1-DNA", "L2-DNA", "L3-DNA", "L4-DNA", "Maxiprep")]))

DNA_prop_cor_plot <- as.data.frame(ifelse(as.matrix(DNA_prop_cor_plot) < -10, NA, as.matrix(DNA_prop_cor_plot)))

#DNA_prop_cor_plot <- as.data.frame(DNA_prop_cor_plot)

p <- ggpairs(DNA_prop_cor_plot , 
        progress = FALSE,
        lower = list(continuous = wrap("points", size = 0.5, alpha = 0.5, pch = 16), 
                     combo = "box", discrete = "blank", na = "na"),
        diag = list(discrete = "blankDiag", continuous = wrap("densityDiag", alpha = 0.4), 
                    na = "blankDiag"), 
        upper = list(continuous = wrap('cor', size = 4)))+
  
        #labs(title = bquote(gDNA~and~Maxiprep~by~log[2](Proportion))) + 
        theme_bw()+ 
        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), 
              text = element_text(size = 10), 
        axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

p

#ggsave("G:/Shared drives/Nord Lab - Temp Overflow/STARR-408_Lambert&Su-Feher/Draft #Figures/Figures_eLIFE_KC/FIG1B_DNAProportionCorrelation_removed.png", 
#       plot = p, dpi = 300, width = 6, height = 6)

#ggsave("G:/Shared drives/Nord Lab - Temp Overflow/STARR-408_Lambert&Su-Feher/Draft #Figures/Figures_eLIFE_KC/FIG1B_DNAProportionCorrelation_removed.svg", 
#       plot = p, dpi = 300, width = 6, height = 6)

#colSums(!is.na(as.matrix(DNA_prop_cor_plot)))


# RNA correlations - capped
RNA_prop_cor_plot <- log2(as.matrix(dplyr::filter(amp.prop.plot, Amp_name %in% data.predict$Amp_name)[,c("L1-RNA", "L2-RNA", "L3-RNA", "L4-RNA", "L4-RNA-35")]))
  
RNA_prop_cor_plot <- as.data.frame(ifelse(RNA_prop_cor_plot < -10, -10, RNA_prop_cor_plot))


p <- ggpairs(RNA_prop_cor_plot, progress = FALSE,
             lower = list(continuous = wrap("points", size = 0.5, alpha = 0.5, pch = 16), 
                          combo = "box", discrete = "blank", na = "na"),
             diag = list(discrete = "blankDiag", continuous = wrap("densityDiag", alpha = 0.4), 
                         na = "blankDiag"), 
             upper = list(continuous = wrap('cor', size = 4))) + 
  #labs(title = bquote(cDNA~by~log[2](Proportion))) + 
  theme_bw()+ 
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), 
        text = element_text(size = 10), 
        axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

p

### Work zone

#ggsave("G:/Shared drives/Nord Lab - Temp Overflow/STARR-408_Lambert&Su-Feher/Draft #Figures/Figures_eLIFE_KC/FIG1B_RNAProportionCorrelation_capped.png", 
#       plot = p, dpi = 300, width = 6, height = 6)

#ggsave("G:/Shared drives/Nord Lab - Temp Overflow/STARR-408_Lambert&Su-Feher/Draft #Figures/Figures_eLIFE_KC/FIG1B_RNAProportionCorrelation_capped.svg", 
#       plot = p, dpi = 300, width = 6, height = 6)


# RNA correlations - 2-10 removed
RNA_prop_cor_plot <- log2(as.matrix(dplyr::filter(amp.prop.plot, Amp_name %in% data.predict$Amp_name)[,c("L1-RNA", "L2-RNA", "L3-RNA", "L4-RNA", "L4-RNA-35")]))
  
RNA_prop_cor_plot <- as.data.frame(ifelse(RNA_prop_cor_plot < -10, NA, RNA_prop_cor_plot))


p <- ggpairs(RNA_prop_cor_plot, progress = FALSE,
             lower = list(continuous = wrap("points", size = 0.5, alpha = 0.5, pch = 16), 
                          combo = "box", discrete = "blank", na = "na"),
             diag = list(discrete = "blankDiag", continuous = wrap("densityDiag", alpha = 0.4), 
                         na = "blankDiag"), 
             upper = list(continuous = wrap('cor', size = 4))) + 
  #labs(title = bquote(cDNA~by~log[2](Proportion))) + 
  theme_bw()+ 
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), 
        text = element_text(size = 10), 
        axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

p

#ggsave("G:/Shared drives/Nord Lab - Temp Overflow/STARR-408_Lambert&Su-Feher/Draft #Figures/Figures_eLIFE_KC/FIG1B_RNAProportionCorrelation_removed.png", 
#       plot = p, dpi = 300, width = 6, height = 6)

#ggsave("G:/Shared drives/Nord Lab - Temp Overflow/STARR-408_Lambert&Su-Feher/Draft #Figures/Figures_eLIFE_KC/FIG1B_RNAProportionCorrelation_removed.svg", 
#       plot = p, dpi = 300, width = 6, height = 6)

#colSums(!is.na(as.matrix(RNA_prop_cor_plot)))
Fig. S2AB (A) Correlation of genomic DNA (“DNA”) representation across biological replicates and the previral plasmid library. Data is shown as log2(proportion) per amplicon. Pearson correlation is shown for each pairwise comparison over the filtered dataset (n = 308). (B) Correlation of cDNA (“RNA”) representation across biological replicates and a technical replicate of Sample 4 which was subjected to increased PCR cycles during sample preparation (35 cycles instead of the standard 30). Data is shown as log2(proportion) per amplicon. Pearson correlation is shown for each pairwise comparison over the filtered dataset (n = 308). Left panels demonstrate correlatins of 308 amplicons with capped proportions at 2^-10. Right panels demonstrate correlations of proportions > 2^-10Fig. S2AB (A) Correlation of genomic DNA (“DNA”) representation across biological replicates and the previral plasmid library. Data is shown as log2(proportion) per amplicon. Pearson correlation is shown for each pairwise comparison over the filtered dataset (n = 308). (B) Correlation of cDNA (“RNA”) representation across biological replicates and a technical replicate of Sample 4 which was subjected to increased PCR cycles during sample preparation (35 cycles instead of the standard 30). Data is shown as log2(proportion) per amplicon. Pearson correlation is shown for each pairwise comparison over the filtered dataset (n = 308). Left panels demonstrate correlatins of 308 amplicons with capped proportions at 2^-10. Right panels demonstrate correlations of proportions > 2^-10Fig. S2AB (A) Correlation of genomic DNA (“DNA”) representation across biological replicates and the previral plasmid library. Data is shown as log2(proportion) per amplicon. Pearson correlation is shown for each pairwise comparison over the filtered dataset (n = 308). (B) Correlation of cDNA (“RNA”) representation across biological replicates and a technical replicate of Sample 4 which was subjected to increased PCR cycles during sample preparation (35 cycles instead of the standard 30). Data is shown as log2(proportion) per amplicon. Pearson correlation is shown for each pairwise comparison over the filtered dataset (n = 308). Left panels demonstrate correlatins of 308 amplicons with capped proportions at 2^-10. Right panels demonstrate correlations of proportions > 2^-10Fig. S2AB (A) Correlation of genomic DNA (“DNA”) representation across biological replicates and the previral plasmid library. Data is shown as log2(proportion) per amplicon. Pearson correlation is shown for each pairwise comparison over the filtered dataset (n = 308). (B) Correlation of cDNA (“RNA”) representation across biological replicates and a technical replicate of Sample 4 which was subjected to increased PCR cycles during sample preparation (35 cycles instead of the standard 30). Data is shown as log2(proportion) per amplicon. Pearson correlation is shown for each pairwise comparison over the filtered dataset (n = 308). Left panels demonstrate correlatins of 308 amplicons with capped proportions at 2^-10. Right panels demonstrate correlations of proportions > 2^-10

Fig. S2AB (A) Correlation of genomic DNA (“DNA”) representation across biological replicates and the previral plasmid library. Data is shown as log2(proportion) per amplicon. Pearson correlation is shown for each pairwise comparison over the filtered dataset (n = 308). (B) Correlation of cDNA (“RNA”) representation across biological replicates and a technical replicate of Sample 4 which was subjected to increased PCR cycles during sample preparation (35 cycles instead of the standard 30). Data is shown as log2(proportion) per amplicon. Pearson correlation is shown for each pairwise comparison over the filtered dataset (n = 308). Left panels demonstrate correlatins of 308 amplicons with capped proportions at 2^-10. Right panels demonstrate correlations of proportions > 2^-10

10.6 Fig. 2A

# Fig 2a has summed activity ratio (cDNA/gDNA) as bars, points for buffered individual rep ratios, colors are grey and red
# Fig 2b has same info as in Fig 2a but with only top 18 shown
# Fig 2c-d has intersection of NS and S sets
# Fig S3 gets dropped, Fig S4 gets GC plots from current Fig S2.


## Fig2A:

fig.2a.bar <- data.predict

# DNA prop + RNA prop + DNA count  = PASSED, n = 219
#fig.2a.bar <- fig.2a.bar[fig.2a.bar$Passed_QC_with_SD == TRUE, ]

# Significance annotation
fig.2a.bar$Sig <- ifelse(fig.2a.bar$Pvalue >= 0.05, "Non-Significant", "P < 0.05")

# Reordering based on Residuals
fig.2a.bar <- fig.2a.bar[order(fig.2a.bar$Residuals_Manual, decreasing = F),]
fig.2a.bar$Amp_name <- factor(fig.2a.bar$Amp_name, levels = fig.2a.bar$Amp_name)

# lm model is defined based on MeanRatio_MoM. MeanRatio is a ration of the 4 activities

fig.2a.bar.melt <- melt(fig.2a.bar[,c("Amp_name", "L1_Activity", "L2_Activity", "L3_Activity", "L4_Activity", "Sig")], id.vars = c("Amp_name", "Sig"))


#head(fig.2a.bar.melt)

# Set hard limits
cap_value = 9

fig.2a.bar.melt$value <- ifelse(fig.2a.bar.melt$value < -cap_value, -cap_value,  fig.2a.bar.melt$value)
fig.2a.bar.melt$value <- ifelse(fig.2a.bar.melt$value > cap_value, cap_value,  fig.2a.bar.melt$value)

fig.2a.bar$MeanRatio <- fig.2a.bar$MeanRatio
fig.2a.bar$MeanRatio <- ifelse(fig.2a.bar$MeanRatio < -cap_value, -cap_value,  fig.2a.bar$MeanRatio)
fig.2a.bar$MeanRatio <- ifelse(fig.2a.bar$MeanRatio > cap_value, cap_value,  fig.2a.bar$MeanRatio)

fig.2a.bar$MeanRatio_MoM <- ifelse(fig.2a.bar$MeanRatio_MoM > cap_value, cap_value, fig.2a.bar$MeanRatio_MoM) 

# Order plot legend
fig.2a.bar$Sig <- factor(fig.2a.bar$Sig, levels = c("P < 0.05","Non-Significant"))

# Plot
p <- ggplot(fig.2a.bar)+
  geom_bar(data = fig.2a.bar, 
           aes(x = Amp_name, y = MeanRatio_MoM, fill = Sig), 
           stat = "identity", alpha = 0.75, width = 1)+
  geom_point(data = fig.2a.bar.melt,
             aes(x = Amp_name, y = value), 
             size = 0.3, alpha = 0.5, color = "black", shape = 16)+
  scale_y_continuous(breaks = c(0, 2, 4, 6, 8, 9),
                     labels=c("0" = "0", "2" = "2", "4" = "4", "6" = "6", "8" = "8", "9" = ">9"))+
    
  scale_fill_manual(values = c("Non-Significant" = "grey",
                               "P < 0.05" = "red"))+
    
  labs(x = "Amplicons",
       y = bquote(mean(RNA/DNA~Ratio)),
       fill = NULL,
       title = "Amplicons sorted by Model Residual")+ 
  theme_bw()+ 
  theme(text = element_text(size = 8), 
                     plot.title = element_text(size = 8, hjust = 0.5), 
                     axis.text.x = element_blank(), 
                     axis.ticks.x = element_blank(), 
                     # panel.background = element_rect(fill = "white"), 
                     panel.grid.major.x = element_blank(),
                     panel.grid.minor = element_blank(),
                     # panel.grid.major.y = element_line(size = 0.5),
                     legend.position = "right", legend.margin = margin(0,0,0,0))

p  

#ggsave("G:/Shared drives/Nord Lab - Temp #Overflow/STARR-408_Lambert&Su-Feher/Draft #Figures/Figures_eLIFE_KC/Fig2A_Var_BarPlot_MeanRatio_with_Points_KC.svg", 
#       plot = p, dpi = 300, width = 6, height = 3)

#ggsave("G:/Shared drives/Nord Lab - Temp #Overflow/STARR-408_Lambert&Su-Feher/Draft #Figures/Figures_eLIFE_KC/Fig2A_Var_BarPlot_MeanRatio_with_Points_KC.png",
#       plot = p, dpi = 300, width = 6, height = 3)



## 9.7 Fig. 2A - padded counts ## 


# Getting counts
df <- dplyr::filter(amp_counts, Color == "STAR408")[,c(4, 6:9, 12, 13, 14, 11)]
rownames(df) <- df$Amp_name
Amp_names <- df$Amp_name
df$Amp_name <- NULL

colnames(df) <- c("L1_DNA", "L2_DNA", "L3_DNA", "L4_DNA", 
                  "L1_RNA", "L2_RNA", "L3_RNA", "L4_RNA")

# Adds a constant count to control low-count variance.
df <- df+1000

amp.prop2 <- as.data.frame(apply(df, 2, function(x) { (x+1)/(sum(x, na.rm = T)+1) }))

# Adding a constant count value
amp.prop2$DNA_prop_mean <- rowMeans(amp.prop2[,c(1:4)])
amp.prop2$RNA_prop_mean <- rowMeans(amp.prop2[,c(5:8)])

amp.prop2 <- data.frame("Amp_name" = Amp_names, amp.prop2)

# Calculation of the ratiometric activity
L1_act <- amp.prop2$L1_RNA / amp.prop2$L1_DNA
L2_act <- amp.prop2$L2_RNA / amp.prop2$L2_DNA
L3_act <- amp.prop2$L3_RNA / amp.prop2$L3_DNA
L4_act <- amp.prop2$L4_RNA / amp.prop2$L4_DNA

amp.act2 <- data.frame("L1_Activity" = L1_act,
                      "L2_Activity" = L2_act,
                      "L3_Activity" = L3_act,
                      "L4_Activity" = L4_act)
                      

df <- data.frame(amp.prop2, amp.act2)
rownames(df) <- NULL

# Activity calculations
df$MeanRatio_MoM <- df$RNA_prop_mean / df$DNA_prop_mean

df$MeanRatio <- rowMeans(df[,c("L1_Activity", "L2_Activity", "L3_Activity", "L4_Activity")])

# Amp_name correction. Thank you for this mess Linda!
df$Amp_name <- gsub("NCASD", "FBDHS", df$Amp_name)
df$Amp_name <- gsub("Epilepsy", "EPIL", df$Amp_name)
df$Amp_name <- gsub("Control", "PutEnh", df$Amp_name)
df$Amp_name <- gsub("SZ108", "SCZ", df$Amp_name)
df$Amp_name <- gsub("EPIL|SCZ", "GWAS", df$Amp_name)
df$Amp_name <- gsub("SCN1A|CACNA1C", "LD", df$Amp_name)


# Filtering the set of 308
df <- dplyr::filter(df, Amp_name %in% data.predict$Amp_name)
df <- merge(df, data.predict[,c("Amp_name", "Residuals_Manual", "Pvalue")], by = "Amp_name")

# Significance annotation
df$Sig <- ifelse(df$Pvalue >= 0.05, "Non-Significant", "P < 0.05")

# Reordering based on Residuals
df <- df[order(df$Residuals_Manual, decreasing = F),]
df$Amp_name <- factor(df$Amp_name, levels = df$Amp_name)

df_plus_1000 <- df

## individual datapoints melted
df.melt <- melt(df[,c("Amp_name", "L1_Activity", "L2_Activity", "L3_Activity", "L4_Activity", "Sig")], id.vars = c("Amp_name", "Sig"))


#head(df.melt)

# Set hard limits
cap_value = 8

df.melt$value <- ifelse(df.melt$value < -cap_value, -cap_value,  df.melt$value)
df.melt$value <- ifelse(df.melt$value > cap_value, cap_value,  df.melt$value)


# 2A plot
p <- ggplot()+
  geom_bar(data = df, 
           aes(x = Amp_name, y = MeanRatio_MoM, fill = Sig), 
           stat = "identity", alpha = 0.75, width = 1)+

  geom_point(data = df.melt,
             aes(x = Amp_name, y = value), 
             size = 0.3, alpha = 0.5, color = "black", shape = 16)+
  scale_fill_manual(values = c("Non-Significant" = "grey",
                               "P < 0.05" = "red"))+
  scale_y_continuous(breaks = c(0, 2, 4, 6, 8),
                     labels=c("0" = "0", "2" = "2", "4" = "4", 
                              "6" = "6", "8" = ">8"))+
    
  labs(x = "Amplicons",
       y = bquote(mean(RNA/DNA~Ratio)),
       fill = NULL,
       title = "Amplicons sorted by Model Residual")+ 
  theme_bw()+ 
  theme(text = element_text(size = 8), 
        plot.title = element_text(size = 8, hjust = 0.5), 
        axis.text.x = element_blank(), 
        axis.ticks.x = element_blank(), 
        panel.grid.major.x = element_blank(),
        panel.grid.minor = element_blank(),
        legend.position = "right", legend.margin = margin(0,0,0,0))

p


#ggsave("G:/Shared drives/Nord Lab - Temp Overflow/STARR-408_Lambert&Su-Feher/Draft #Figures/Figures_eLIFE_KC/Fig2A.svg", 
#       plot = p, dpi = 300, width = 5, height = 3)
#ggsave("G:/Shared drives/Nord Lab - Temp Overflow/STARR-408_Lambert&Su-Feher/Draft #Figures/Figures_eLIFE_KC/Fig2A.png", 
#       plot = p, dpi = 300, width = 5, height = 3)
Fig. 2A. Bar plot representing distribution of activity in the test assay, sorted by linear model residuals. Amplicons are colored as in Figure 1B. To control low-count variability, 1000 counts were added to RNA and DNA counts to stabilize ratios where counts were low (right panel). n = 308 ampliconsFig. 2A. Bar plot representing distribution of activity in the test assay, sorted by linear model residuals. Amplicons are colored as in Figure 1B. To control low-count variability, 1000 counts were added to RNA and DNA counts to stabilize ratios where counts were low (right panel). n = 308 amplicons

Fig. 2A. Bar plot representing distribution of activity in the test assay, sorted by linear model residuals. Amplicons are colored as in Figure 1B. To control low-count variability, 1000 counts were added to RNA and DNA counts to stabilize ratios where counts were low (right panel). n = 308 amplicons

# Delete after manuscript acceptance.

## 9.8 Flag variable amplicons. This is just an exploratory approach.
#It didn't get to the paper. We decided to show all 308 amplicons in Fig. 2a.

# Getting counts
df <- dplyr::filter(amp_counts, Color == "STAR408")[,c(4, 6:9, 12, 13, 14, 11)]
rownames(df) <- df$Amp_name
Amp_names <- df$Amp_name
df$Amp_name <- NULL

colnames(df) <- c("L1_DNA", "L2_DNA", "L3_DNA", "L4_DNA", 
                  "L1_RNA", "L2_RNA", "L3_RNA", "L4_RNA")

# Adds a constant count to control low-count variance.
df <- df+1000  # No counts added to keep ratiometric activity "raw"

amp.prop2 <- as.data.frame(apply(df, 2, function(x) { (x+1)/(sum(x, na.rm = T)+1) }))

# Adding a constant count value
amp.prop2$DNA_prop_mean <- rowMeans(amp.prop2[,c(1:4)])
amp.prop2$RNA_prop_mean <- rowMeans(amp.prop2[,c(5:8)])

amp.prop2 <- data.frame("Amp_name" = Amp_names, amp.prop2)

# Calculation of the ratiometric activity
L1_act <- amp.prop2$L1_RNA / amp.prop2$L1_DNA
L2_act <- amp.prop2$L2_RNA / amp.prop2$L2_DNA
L3_act <- amp.prop2$L3_RNA / amp.prop2$L3_DNA
L4_act <- amp.prop2$L4_RNA / amp.prop2$L4_DNA

amp.act2 <- data.frame("L1_Activity" = L1_act,
                      "L2_Activity" = L2_act,
                      "L3_Activity" = L3_act,
                      "L4_Activity" = L4_act)
                      

df <- data.frame(amp.prop2, amp.act2)
rownames(df) <- NULL

# Activity calculations
df$MeanRatio_MoM <- df$RNA_prop_mean / df$DNA_prop_mean

df$MeanRatio <- rowMeans(df[,c("L1_Activity", "L2_Activity", "L3_Activity", "L4_Activity")])

# Amp_name correction. Thank you for this mess Linda!
df$Amp_name <- gsub("NCASD", "FBDHS", df$Amp_name)
df$Amp_name <- gsub("Epilepsy", "EPIL", df$Amp_name)
df$Amp_name <- gsub("Control", "PutEnh", df$Amp_name)
df$Amp_name <- gsub("SZ108", "SCZ", df$Amp_name)
df$Amp_name <- gsub("EPIL|SCZ", "GWAS", df$Amp_name)
df$Amp_name <- gsub("SCN1A|CACNA1C", "LD", df$Amp_name)


# Filtering the set of 308
df <- dplyr::filter(df, Amp_name %in% data.predict$Amp_name)
df <- merge(df, data.predict[,c("Amp_name", "Residuals_Manual", "Pvalue")], by = "Amp_name")

# Significance annotation
df$Sig <- ifelse(df$Pvalue >= 0.05, "Non-Significant", "P < 0.05")



# Reordering based on Residuals
df <- df[order(df$Residuals_Manual, decreasing = F),]
df$Amp_name <- factor(df$Amp_name, levels = df$Amp_name)


# Flags individual ratiometric activity greater than MeanRatio_MoM

upper_t = 3
lower_t = 0.33


df$L1_act_flag <- (df$L1_Activity / df$MeanRatio_MoM) > upper_t | (df$L1_Activity / df$MeanRatio_MoM) < lower_t
  
df$L2_act_flag <- (df$L2_Activity / df$MeanRatio_MoM) > upper_t | (df$L2_Activity / df$MeanRatio_MoM) < lower_t

df$L3_act_flag <- (df$L3_Activity / df$MeanRatio_MoM) > upper_t | (df$L3_Activity / df$MeanRatio_MoM) < lower_t

df$L4_act_flag <- (df$L4_Activity / df$MeanRatio_MoM) > upper_t | (df$L4_Activity / df$MeanRatio_MoM) < lower_t

# How many amplicons are flagged:
f_a = sum(rowSums(df[,21:24]) > 0) #132
f_L1 = sum(df$L1_act_flag == TRUE)
f_L2 = sum(df$L2_act_flag == TRUE)
f_L3 = sum(df$L3_act_flag == TRUE)
f_L4 = sum(df$L4_act_flag == TRUE)

flag_summary <- melt(data.frame("Flagged_all" = f_a,
           "Flagged_L1" = f_L1,
           "Flagged_L2" = f_L2,
           "Flagged_L3" = f_L3,
           "Flagged_L4" = f_L4))

colnames(flag_summary) <- NULL
flag_summary


# Modifies Sig column to mark flagged amplicons

df$Sig <- ifelse(rowSums(df[,21:24]) > 0, "Flagged", df$Sig)


## individual datapoints melted
df.melt <- melt(df[,c("Amp_name", "L1_Activity", "L2_Activity", "L3_Activity", "L4_Activity", "Sig")], 
    id.vars = c("Amp_name", "Sig"))

# melted flags
df.melt_flag <- melt(df[,c("Amp_name", "L1_act_flag", 
                       "L2_act_flag",
                       "L3_act_flag",
                       "L4_act_flag")], 
                     id.vars = c("Amp_name"))

df.melt_flag <- df.melt_flag[,c(1,3)]

colnames(df.melt_flag) <- c("Amp_name", "flag_value")

#df.melt <- merge(df.melt, df.melt_flag, by = "Amp_name")
all(df.melt$Amp_name == df.melt_flag$Amp_name)

df.melt$flag_value <- df.melt_flag$flag_value


# Set hard limits
cap_value = 12.5

df.melt$value <- ifelse(df.melt$value < -cap_value, -cap_value,  df.melt$value)
df.melt$value <- ifelse(df.melt$value > cap_value, cap_value,  df.melt$value)



# 2A plot

ggplot()+
  geom_bar(data = df, 
           aes(x = Amp_name, y = MeanRatio_MoM, fill = Sig), 
           stat = "identity", alpha = 0.75, width = 1)+

  geom_point(data = df.melt,
             aes(x = Amp_name, y = value, color = flag_value), 
             size = 0.8, alpha = 0.5, shape = 16)+
   scale_fill_manual(values = c("Non-Significant" = "grey", "P < 0.05" = "red", "Flagged" = "blue"))+
  scale_color_manual(values = c("TRUE" = "blue", "FALSE" = "black"))+
    
  labs(x = "Amplicons",
       y = bquote(mean(RNA/DNA~Ratio)),
       fill = NULL,
       title = "Amplicons sorted by Model Residual")+ 
  theme_bw()+ 
  theme(text = element_text(size = 12), 
        plot.title = element_text(size = 12, hjust = 0.5), 
        axis.text.x = element_blank(), 
        axis.ticks.x = element_blank(), 
        panel.grid.major.x = element_blank(),
        panel.grid.minor = element_blank(),
        legend.position = "right", legend.margin = margin(0,0,0,0))+
  facet_grid(~flag_value)

10.7 Fig. 2B

## Sanity check for reporting amplicon number 
df <- data.predict
df$MeanRatio_m_SD <- df$MeanRatio - df$MeanRatio_sd

active_amplicons_in_regression_model <- filter(df, Pvalue < 0.05, Residuals_Z_scaled_to_lm > 0)$Amp_name  # 36 amplicons
active_amplicons_in_ratiometric_comparison <- filter(df, MeanRatio > 1)$Amp_name                          # 156 amplicons

#setdiff(active_amplicons_in_regression_model, active_amplicons_in_ratiometric_comparison) # 0 amplicons = complete inclusion of active_amplicons_in_regression_model


# Figure 2B, Fig2B
     
fig_2B_df <- filter(df, Pvalue < 0.05, Residuals_Z_scaled_to_lm > 0, MeanRatio_m_SD > 1.5)
#dim(fig_2B_df)

# Color addition is unnecessary
fig_2B_df$Color <- ifelse(fig_2B_df$Pvalue >= 0.05, "Non-Significant", "Significant (resd. > 0)")
fig_2B_df$Color <- ifelse(fig_2B_df$Pvalue < 0.05 & fig_2B_df$Residuals_Manual < 0, "Significant (resd. < 0)", fig_2B_df$Color)

fig_2B_df <- fig_2B_df[order(fig_2B_df$Residuals_Manual, decreasing = F),]
fig_2B_df$Amp_name <- factor(fig_2B_df$Amp_name, levels = fig_2B_df$Amp_name)

fig_2B_df.melt <- melt(fig_2B_df[,c("Amp_name", "L1_Activity", "L2_Activity", "L3_Activity", "L4_Activity")], id.vars = "Amp_name")

p <- ggplot(fig_2B_df) + 
  geom_bar(data = fig_2B_df, 
           aes(x = Amp_name, y = MeanRatio, fill = Color), 
           stat = "identity", alpha = 0.5, width = 0.75) +
  geom_line(data = fig_2B_df.melt, 
            aes(x = Amp_name, y = value, group = Amp_name), 
            size = 0.25, alpha = 0.5, color = "black") + 
  geom_point(data = fig_2B_df.melt,
             aes(x = Amp_name, y = value), 
             size = 0.5, alpha = 0.75, color = "black", shape = 16) +
  labs(x = "Amplicons",
       y = bquote(Mean(RNA/DNA~Ratio)),
       fill = NULL,
       title = "Amplicons with p < 0.05, Resd. > 0, Mean Activity - SD >  1.5") + 
  theme_bw() + theme(text = element_text(size = 8), 
                     plot.title = element_text(size = 8, hjust = 0.5), 
                     axis.text.x = element_blank(), 
                     axis.ticks.x = element_blank(), 
                     # panel.background = element_rect(fill = "white"), 
                     panel.grid.major.x = element_blank(),
                     panel.grid.minor = element_blank(),
                     # panel.grid.major.y = element_line(size = 0.5),
                     legend.position = "none")+
  scale_fill_manual(values = c("Significant (resd. > 0)" = "blue",
                               "Significant (resd. < 0)" = "red",
                               "Non-Significant" = "grey50"))+
  theme(text = element_text(size = 10), axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1))

p
Fig.2B. The top 20 active amplicons with consistent activity across both linear models and ratiometric models. Three amplicons were used for downstream validation in single-candidate deliveries (6_LD, 3_LD, 161_FBDHS). Bar for all plots represents mean RNA/DNA ratio. Each dot represents individual RNA/DNA ratios for each amplicon (4 dots per amplicon).

Fig.2B. The top 20 active amplicons with consistent activity across both linear models and ratiometric models. Three amplicons were used for downstream validation in single-candidate deliveries (6_LD, 3_LD, 161_FBDHS). Bar for all plots represents mean RNA/DNA ratio. Each dot represents individual RNA/DNA ratios for each amplicon (4 dots per amplicon).

#ggsave("G:/Shared drives/Nord Lab - Temp Overflow/STARR-408_Lambert&Su-Feher/Draft #Figures/Figures_eLIFE_KC/Fig2B.svg", 
#       plot = p, dpi = 300, width = 4, height = 3)
#ggsave("G:/Shared drives/Nord Lab - Temp Overflow/STARR-408_Lambert&Su-Feher/Draft #Figures/Figures_eLIFE_KC/Fig2B.png", 
#       plot = p, dpi = 300, width = 4, height = 3)

11. Epigenomic intersections

11.1 Roadmap data download

# Original version of this analysis can be found here: 

require("RCurl") 
library(XML)
library(R.utils)

# Sample identity: https://egg2.wustl.edu/roadmap/web_portal/processed_data.html
# E81 - Fetal Brain Male
# E82 - Fetal Brain Female

#setwd("G:/Shared drives/Nord Lab - Computational Projects/MPRA/STAR408/Epigenomics_intersections")

# Creates a subdir for downloading Roadmap peak data
peak_dir <- "Roadmap_consolidated_peaks/"
dir.create(peak_dir, showWarnings = FALSE)

# Downloads data
url_narrowPeak = "https://egg2.wustl.edu/roadmap/data/byFileType/peaks/consolidated/narrowPeak/"

# Get links
result <- getURL(url_narrowPeak, verbose=TRUE, ftp.use.epsv=TRUE, dirlistonly = TRUE)
gz_files <- getHTMLLinks(result,  xpQuery = "//a/@href[contains(., '.gz')]")
download_files <- gz_files[grepl("E081|E082", gz_files)]        # grepl Male and Female fetal samples

# Check if files are present in a directory:
if(all(list.files(peak_dir) %in% gsub(".gz","", download_files))){
  
  message("Files exist")
  
} else {


# Proactively removes all files in the peak_dir
setwd(peak_dir)
file.remove(list.files())
setwd("../")

# Download
sapply(download_files, function(x) 
  download.file(paste0(url_narrowPeak, x), paste0(peak_dir, x)))

# Gunzip
sapply(download_files, function(x) 
  gunzip(paste0(peak_dir, x)))

}

11.2 Merge hg19 coordinates

# Reading and preparing amplicons and hg19 primer coordinate data

# I'm merging primers and star datasets to translate GRCh38 to hg19.
# Primers use hg19 coordinates. star408 amplicons use GRCh38 coordinates. isPCR was also run against GRCh38 genome.

primers <- read.csv("G:/Shared drives/Nord Lab - Computational Projects/MPRA/STAR408/In_silico_PCR_STAR408/STAR408_primers.csv", header = TRUE, fill = TRUE)

primers$Chr <- gsub('.*?([0-9XY]+).*', '\\1', primers$Amplicon.Range)
amp_range <- gsub('.*?[0-9XY]+:(d*)+d*', '\\1', primers$Amplicon.Range)
primers$Start <- gsub('([^+]*).*', '\\1', amp_range, perl = TRUE)
primers$Stop <-gsub('.*\\+([^+]*)', '\\1', amp_range, perl = TRUE)
primers <- primers[,c(12:14, 1:11)]

hg19_coordinates <- primers[,c(1:3, 12)]
#head(hg19_coordinates)

# Readign the final MPRA analysis set
# star408 <- read.csv("G:/Shared drives/Nord Lab - Computational Projects/MPRA/STAR408/CSV_output_files/STAR408_lm_model_complete_dataset_408_amplicons.csv")
# amplicons_309 <- filter(star408, Pass_prop_DNA_and_RNA == TRUE, Pass_DNA_count == TRUE)

#amplicons_321 <- read.csv("G:/Shared drives/Nord Lab - Computational Projects/MPRA/STAR408/CSV_output_files/STAR408_data.predict.N321.csv")

# NOTE: KC. Earlier version was using an expanded set of 321 amplicons. We are reverting to a set of 308 for consistency and stringency.

# Amp_Full_Name
#data.predict$Amp_name

# Linda changed amp names. I'll be matching Amp names in primers / hg19_coordinates so I can merge them.
#hg19_coordinates$Amplicon
hg19_coordinates$Amplicon.ID_new <- gsub("Epilepsy", "EPIL", hg19_coordinates$Amplicon)
hg19_coordinates$Amplicon.ID_new <- gsub("SZ108", "SCZ", hg19_coordinates$Amplicon.ID_new)
hg19_coordinates$Amplicon.ID_new <- gsub("NCASD", "FBDHS", hg19_coordinates$Amplicon.ID_new)
hg19_coordinates$Amplicon.ID_new <- gsub("Controls", "PutEnh", hg19_coordinates$Amplicon.ID_new)

#hg19_coordinates$Amplicon.ID_new

data.predict$Amp_Full_Name <- gsub("Epilepsy", "EPIL", data.predict$Amp_Full_Name)
data.predict$Amp_Full_Name <- gsub("SZ108", "SCZ", data.predict$Amp_Full_Name)
data.predict$Amp_Full_Name <- gsub("NCASD", "FBDHS", data.predict$Amp_Full_Name)
data.predict$Amp_Full_Name <- gsub("Controls", "PutEnh", data.predict$Amp_Full_Name)
data.predict$Amp_Full_Name <- gsub("Control", "PutEnh", data.predict$Amp_Full_Name)

data.predict$Amp_Full_Name <- gsub("252_PutEnh_Arid1b_1", "252_PutEnh", data.predict$Amp_Full_Name)
data.predict$Amp_Full_Name <- gsub("253_PutEnh_NEG1", "253_PutEnh", data.predict$Amp_Full_Name)
data.predict$Amp_Full_Name <- gsub("255_PutEnh_Klhl32_hs676", "255_PutEnh", data.predict$Amp_Full_Name)
data.predict$Amp_Full_Name <- gsub("257_PutEnh_Btrc_hs897", "257_PutEnh", data.predict$Amp_Full_Name)
data.predict$Amp_Full_Name <- gsub("258_PutEnh_Arx_hs122", "258_PutEnh", data.predict$Amp_Full_Name)
data.predict$Amp_Full_Name <- gsub("259_PutEnh_Scn1a", "259_PutEnh", data.predict$Amp_Full_Name)
data.predict$Amp_Full_Name <- gsub("264_PutEnh_NEG4", "264_PutEnh", data.predict$Amp_Full_Name)
data.predict$Amp_Full_Name <- gsub("266_PutEnh_Etv1_hs550", "266_PutEnh", data.predict$Amp_Full_Name)

# Arid1b_1 and Arid1b_3 are overlapping. Dropping Arid1b_3.
hg19_coordinates <- hg19_coordinates[!hg19_coordinates$Amplicon.ID_new == "265_PutEnh",]

#data.frame(data.predict$Amp_name, data.predict$Amp_name2, data.predict$Amp_Full_Name)

#setdiff(hg19_coordinates$Amplicon.ID_new, data.predict$Amp_Full_Name) # 99
#setdiff(data.predict$Amp_Full_Name, hg19_coordinates$Amplicon.ID_new) # 0 - fixed



#head(hg19_coordinates)
#head(data.predict)
colnames(hg19_coordinates) <- c("Chr_hg19", "Start_hg19", "Stop_hg19", "Amplicon.ID", "Amplicon.ID_new")

data.predict_w_hg19_coord <- merge(hg19_coordinates, data.predict, 
                                    by.x = "Amplicon.ID_new", by.y = "Amp_Full_Name")

colnames(data.predict_w_hg19_coord) <-  gsub("\\<Start\\>","Start_GRCh38", colnames(data.predict_w_hg19_coord))
colnames(data.predict_w_hg19_coord) <-  gsub("\\<End\\>","End_GRCh38", colnames(data.predict_w_hg19_coord))
colnames(data.predict_w_hg19_coord) <-  gsub("\\<Chr\\>","Chr_GRCh38", colnames(data.predict_w_hg19_coord))

#colnames(data.predict_w_hg19_coord) 

#nrow(data.predict_w_hg19_coord)
#nrow(hg19_coordinates)
#nrow(data.predict)

# Generating MPRA GR object
data.predict_w_hg19_coord <- data.predict_w_hg19_coord[,c(2:4, 7:9, 1,5, 10:66)]
colnames(data.predict_w_hg19_coord) <- c("Chr", "Start", "End", colnames(data.predict_w_hg19_coord)[4:65])


data.predict_w_hg19_coord$Chr <- paste0("chr", data.predict_w_hg19_coord$Chr) # For compatibility
data.predict_w_hg19_coord_GR <- makeGRangesFromDataFrame(data.predict_w_hg19_coord)
values(data.predict_w_hg19_coord_GR) <- data.predict_w_hg19_coord[,4:65]

11.3 Calculate intersections

source("epi_mark_intersection.R")
library(data.table)

# Calculate interactions
calculate_interactions <- function(samples, epi_marks, q_values){
# samples <- c("E081", "E082")
# epi_marks <-  c("DNase.macs2", "H3K27me3", "H3K36me3", "H3K4me1", "H3K4me3", "H3K9me3")
# q_values <-   c(1, 0.05, 0.01)

l <- lapply(epi_marks, function(y){
lapply(q_values, function(x){
  
  epi_mark_intersection(y, samples, x)
  
  })  
})

df <- rbindlist(lapply(1:length(l), function(x) rbindlist(l[[x]])))  # there must be a better way...

df$Epi_mark <- gsub("DNase.macs2", "DNase", df$Epi_mark)

df$Significance <- ifelse(df$P_values_perm >= 0.05, "Non-significant", "NA")
df$Significance <- ifelse(df$P_values_perm < 0.05, "P < 0.05", df$Significance)


df_m <- melt(df, id.vars = c("Condition", "Epi_mark", "Epi_peak_q_value", "Fraction_intersecting", "P_values_perm", "Significance"))

df_m <- arrange(df_m, desc(Epi_peak_q_value))
df_m$Epi_peak_q_value <- factor(df_m$Epi_peak_q_value, levels = unique(as.numeric(df_m$Epi_peak_q_value)))

# Peak mark order
df_m$Epi_mark <- factor(df_m$Epi_mark, levels = c("DNase", "H3K27ac", "H3K9ac", "H3K4me1", "H3K4me3", "H3K36me3", "H3K9me3", "H3K27me3"))

df_m$Significance <- factor(df_m$Significance, levels = c("P < 0.05","Non-significant"))

df_m

}

# Male and Female "E081", "E082" datasets are merged together in this approach. Intersection indicates overlap between any of the samples.


# This is loaded from a file to speed up report generation
if(file.exists("Epi_interst.RData")){
  
  load("Epi_interst.RData")
  
} else {
  
df_m <- calculate_interactions(c("E081", "E082"), 
                               c("DNase.macs2", "H3K27me3", "H3K36me3", "H3K4me1", "H3K4me3", "H3K9me3"), 
                               c(1, 0.05, 0.01))

#save(df_m, file = "Epi_interst.RData")    
  
}

11.4 Intersections across peak q values

# Intersections across Peak q values
title = "Fetal Brain Male and Female"

ggplot(df_m, aes(x = Epi_peak_q_value, y = Fraction_intersecting, color = Condition, group = Condition, 
                 shape = Significance))+
  geom_point(size = 3, alpha = 0.3)+
  geom_line()+
  labs(x = "Roadmap epigenomic MACS2 peak q value threshold", y = paste0("Fraction of amplicons overlapping with epigenomic peaks"))+
  expand_limits(y = 0)+
  theme_bw()+
  theme(plot.title = element_text(hjust = 0.5))+
  facet_grid(~Epi_mark)+
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5))+
  labs(title = title)+
  labs(shape='Significance of enrichment \n (Group enrichment, \n permutation test)')+
  labs(color='Amplicon residual activity')+
  scale_color_manual(values=c("Significant" = "#F8766D", 
                              "Non_significant" = "grey"))+
  guides(
   shape = guide_legend(order = 2),
   color = guide_legend(order = 1)
  )+
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())

# Stacked barplot representation

df2 <- df_m
df2$Fraction_not_intersecting <- 1- df2$Fraction_intersecting
df2 <- df2[,c("Condition", "Epi_mark", "Fraction_intersecting", "Fraction_not_intersecting",  "Epi_peak_q_value")]

df2_m <- melt(df2, id.vars = c("Condition", "Epi_mark", "Epi_peak_q_value"))

df2_m$Epi_mark <- factor(df2_m$Epi_mark, levels = c("DNase", "H3K4me1", "H3K4me3", "H3K36me3", "H3K9me3", "H3K27me3"))
df2_m$Condition <- factor(df2_m$Condition, levels = c("Significant", "Non_significant"))

df2_m <- distinct(df2_m)

### Stacked barplot - all peak significance levels
ggplot(df2_m, aes(fill = variable, y=value, x=Condition)) + 
    geom_bar(position="stack", stat="identity")+
    theme_bw()+
    labs(x = "", y = "Fraction of MPRA amplicons with overlapping FB DHS peaks")+
    theme(legend.title = element_blank())+
    facet_grid(Epi_peak_q_value ~ Epi_mark, margins = FALSE)+
    theme(axis.text.x = element_text(angle = 90))

11.6 Fig.2C Fetal Brain Intersections at q<0.05

# Only at q < 0.05 roadmap peak
df3_m <- dplyr::filter(df2_m, Epi_peak_q_value == 0.05) 

df3_m$variable <- ifelse(df3_m$variable == "Fraction_intersecting", "Intersecting", "Non-intersecting")

fill <- c("#b2d183", "#40b8d0")
  
Fig_2C <- ggplot(df3_m, aes(fill = variable, y=value, x=Condition)) + 
    geom_bar(position="stack", stat="identity")+
    theme_bw()+
    labs(x = "", 
         y = "Fraction of MPRA amplicons overlapping \n with epigenomic datasets")+
    labs(fill = "MPRA amplicon:")+  
    facet_wrap(~Epi_mark)+
    scale_fill_manual(values=fill)+
    theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
    theme(strip.text.x = element_text(size = 14))+
    theme(legend.position="bottom")

Fig_2C
Fig.2C Amplicon intersection with fetal brain epigenomic datasets from the Roadmap Epigenomics Project. Amplicons were divided into two groups based on the statistical significance of their activity in the MPRA.

Fig.2C Amplicon intersection with fetal brain epigenomic datasets from the Roadmap Epigenomics Project. Amplicons were divided into two groups based on the statistical significance of their activity in the MPRA.

#ggsave("G:/Shared drives/Nord Lab - Temp Overflow/STARR-408_Lambert&Su-Feher/Draft #Figures/Figures_eLIFE_KC/Fig2C.svg", 
#       plot = Fig_2C, dpi = 300, width = 7, height = 6)
#ggsave("G:/Shared drives/Nord Lab - Temp Overflow/STARR-408_Lambert&Su-Feher/Draft #Figures/Figures_eLIFE_KC/Fig2C.png", 
#       plot = Fig_2C, dpi = 300, width = 7, height = 6)

11.6.1 Enrichment P values

Enrichment with permutation test at P < 0.01 and peak q value at 0.05

epi_p_table_2C <- distinct(filter(df_m, P_values_perm < 0.1, Epi_peak_q_value == 0.05)[,c("Condition", "Epi_mark", "Epi_peak_q_value", "Fraction_intersecting", "P_values_perm", "Significance")])

knitr::kable(arrange(epi_p_table_2C, Epi_mark))
Condition Epi_mark Epi_peak_q_value Fraction_intersecting P_values_perm Significance
Significant DNase 0.05 0.4878049 0.0354835 P < 0.05
Significant H3K4me1 0.05 0.4390244 0.0274045 P < 0.05
Significant H3K4me3 0.05 0.1707317 0.0131099 P < 0.05

11.7 Fig.2D Human Epi. Intersections at q<0.05

# Yurong Wang calculated epigenomic intersections using a set of 321 MPRA amplicons. In a revised version of this analysis swe decided to limit the reported set to 308

# Move this file up in the file structure
y_data <- read.csv("G:/Shared drives/Nord Lab - Computational Projects/MPRA/STAR408/Epigenomics_intersections/STAR408_cleaned_dataset_TF_PhastCons_ATAC_321_amplicons.csv")

#setdiff(data.predict$Amp_name2, y_data$Amp_name2) # the set of 321 includes the set of 308.
#setdiff(y_data$Amp_name2, data.predict$Amp_name2) # 13 extra amplicons in a set of 321

y_data <- dplyr::filter(y_data, Amp_name2 %in% data.predict$Amp_name2)

#colnames(y_data)
#dim(y_data)

# In case there are any differences in the model parameters between Yurong's intersections and our current model, I'm re merging the two datasets.

y_data <- merge(data.predict, y_data[,c(47, 67:116)], by = "Amp_name2")


# Calculating intersections
y_data$Act_group <- ifelse(y_data$Pvalue >= 0.05, "Non_significant", NA)
y_data$Act_group <- ifelse(y_data$Pvalue < 0.05, "Significant", y_data$Act_group)


yurong_intersection <- function(variable_of_interest){
 
#variable_of_interest = "ovlp_TF_fBrain"
  
df <- y_data[,c("Amp_Full_Name" , variable_of_interest, "Pvalue",  "Residuals_Z_scaled_to_lm", "Act_group")]

df$intersecting <- df[,2] > 0 

library(tidyverse)
intersection_counts <- df %>% group_by(Act_group) %>% count(intersecting)
intersection_counts <- as.data.frame(intersection_counts)

colnames(intersection_counts)[3] <- "counts"

a <- subset(intersection_counts, intersecting == TRUE)
b <- subset(intersection_counts, intersecting == FALSE)

colnames(a)[3] <- "Intersecting_amplicons"
colnames(b)[3] <- "Non-intersecting_amplicons"

df <- data.frame(a[,c(1,3)], b[,3])
colnames(df) <- c("Act_group", "Intersecting_amplicons", "Non-intersecting_amplicons")

df$Amplicons_in_Act_group <- reshape2::melt(table(y_data$Act_group))$value
df$n_of_all_amplicons <- nrow(y_data)

df$Fraction_intersecting <- df$Intersecting_amplicons / df$Amplicons_in_Act_group
df$Fraction_Non_intersecting <- df$`Non-intersecting_amplicons` / df$Amplicons_in_Act_group

df$Epi_mark <- variable_of_interest

#df

### P values ####

# Reduced Yurong's dataset to columns of interest
y_data <- y_data[,c("Amp_Full_Name", "Act_group", 
                    "ovlp_TF_fBrain", "ovlp_TF_fLung", 
                    "ovlp_TF_K562", "ovlp_ATAC_neuron_STAR408_result_ovlp_ATAC_merged",
                    "ovlp_ATAC_glia_STAR408_result_ovlp_ATAC_merged", "ovlp_consEl_all")]

# Indicating intersections. I think the numerical values represent the number of intersecting features?? 
# Yurong has the original code 
y_d <- cbind(y_data[,c(1:2)], as.data.frame(as.matrix(y_data[,c(3:8)]) > 0))

# Ns of amplicons in each activity group
n_of_Non_significant <-  sum(y_d$Act_group == "Non_significant")
n_of_Significant <- sum(y_d$Act_group == "Significant")

y_d_Non_significant <- dplyr::filter(y_d, Act_group == "Non_significant")
y_d_Significant <- dplyr::filter(y_d, Act_group == "Significant")


n_of_Non_significant_intersect <- sum(y_d_Non_significant[,variable_of_interest]) 
n_of_Significant_intersect <- sum(y_d_Significant[,variable_of_interest]) 


all_intersecting <- sum(n_of_Non_significant_intersect, n_of_Significant_intersect)
all_amp <- nrow(data.predict)
    
# Random vector or 0s and 1s, all_amp values, with the number of ones =  all_intersecting
seed_value <- 1234

set.seed(seed_value)
random_vector <- sample(c(rep(1, all_intersecting), rep(0, all_amp - all_intersecting)))

# Permutation test Non significant group
set.seed(seed_value)

n_of_perm <- 20000

set.seed(seed_value)
NS_perm <- replicate(n_of_perm, sum(sample(random_vector, n_of_Non_significant, FALSE)))
n_of_NS_scaled <- (n_of_Non_significant_intersect - mean(NS_perm)) / sd(NS_perm)
NS_p <- pnorm(n_of_NS_scaled, lower.tail=FALSE)

# Permutation test Significant group
set.seed(seed_value)
S_perm <- replicate(n_of_perm, sum(sample(random_vector, n_of_Significant, FALSE)))
n_of_S_scaled <- (n_of_Significant_intersect - mean(S_perm)) / sd(S_perm)
S_p <- pnorm(n_of_S_scaled, lower.tail=FALSE)

df$P_values_perm <- c(NS_p, S_p)
df

}

df <- rbind(
yurong_intersection("ovlp_TF_fBrain"),
yurong_intersection("ovlp_TF_fLung"),
yurong_intersection("ovlp_TF_K562"),
yurong_intersection("ovlp_ATAC_neuron_STAR408_result_ovlp_ATAC_merged"),
yurong_intersection("ovlp_ATAC_glia_STAR408_result_ovlp_ATAC_merged"),
yurong_intersection("ovlp_consEl_all")
)


df$Epi_mark <- ifelse(df$Epi_mark == "ovlp_TF_fBrain", "Fetal Brain TF Footprints", df$Epi_mark)
df$Epi_mark <- ifelse(df$Epi_mark == "ovlp_TF_fLung", "Fetal Lung TF Footprints", df$Epi_mark)
df$Epi_mark <- ifelse(df$Epi_mark == "ovlp_TF_K562", "K562 cells TF Footprints", df$Epi_mark)
df$Epi_mark <- ifelse(df$Epi_mark == "ovlp_ATAC_neuron_STAR408_result_ovlp_ATAC_merged", 
                      "ATAC-seq, Human Neurons", df$Epi_mark)
df$Epi_mark <- ifelse(df$Epi_mark == "ovlp_ATAC_glia_STAR408_result_ovlp_ATAC_merged", 
                      "ATAC-seq, Human Glia", df$Epi_mark)
df$Epi_mark <- ifelse(df$Epi_mark == "ovlp_consEl_all", 
                      "Conserved Element", df$Epi_mark)



df_m <- melt(df[,c("Act_group", "Epi_mark", "Fraction_intersecting", "Fraction_Non_intersecting", "P_values_perm")], 
             id.vars = c("Act_group", "Epi_mark", "P_values_perm"))

df_m$Act_group <- factor(df_m$Act_group, levels = c("Significant", "Non_significant"))

df_m$Epi_mark <- factor(df_m$Epi_mark, levels = c("ATAC-seq, Human Neurons", "ATAC-seq, Human Glia", "Conserved Element", "Fetal Brain TF Footprints", "Fetal Lung TF Footprints", "K562 cells TF Footprints"))

#Plot

fill <- c("#b2d183", "#40b8d0")

Fig_2D <- ggplot(df_m, aes(x = Act_group, y=value, fill=variable)) + 
    geom_bar(position="stack", stat="identity")+
    theme_bw()+
    labs(x = "Amplicon residual activity", 
         y = "Fraction of MPRA amplicons overlapping \n with epigenomic datasets")+
    labs(fill = "MPRA amplicon:")+
    scale_fill_manual(values=fill)+
    theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
    facet_wrap(~Epi_mark)+
    theme(strip.text.x = element_text(size = 14))+
    theme(legend.position="bottom")



Fig_2D      
Fig.2D The same amplicon intersection analysis with various human epigenomics datasets and vertebrate conserved elements, see text for details of intersection datasets.

Fig.2D The same amplicon intersection analysis with various human epigenomics datasets and vertebrate conserved elements, see text for details of intersection datasets.

#ggsave("G:/Shared drives/Nord Lab - Temp Overflow/STARR-408_Lambert&Su-Feher/Draft Figures/Figure 2/Fig_2D.svg", Fig_2D, width = 8, height = 5, dpi = 300, units = "in")

#ggsave("G:/Shared drives/Nord Lab - Temp Overflow/STARR-408_Lambert&Su-Feher/Draft Figures/Figure 2/Fig_2D.png", Fig_2D, width = 8, height = 5, dpi = 300, units = "in")

#distinct(filter(df_m, P_values_perm < 0.3)[,colnames(df_m)[1:3]])

### set.seed(1234) ###
#                            P value        N of permutations
# ATAC-seq, Human Neurons    0.05139055   - 5,000
# ATAC-seq, Human Neurons    0.05081277   - 10,000 
# ATAC-seq, Human Neurons    0.04968570   - 20,000  - This is in the manuscript
# ATAC-seq, Human Neurons    0.04967320   - 40,000 
# ATAC-seq, Human Neurons    0.04887011   - 60,000
# ATAC-seq, Human Neurons    0.04881656   - 80,000
# ATAC-seq, Human Neurons    0.04850609   - 100,000
# ATAC-seq, Human Neurons    0.04856889   - 200,000
# ATAC-seq, Human Neurons    0.04893161   - 300,000

### set.seed(1) ###
#                            P value        N of permutations
# ATAC-seq, Human Neurons    0.04721183   - 5,000
# ATAC-seq, Human Neurons    0.04990434   - 10,000 
# ATAC-seq, Human Neurons    0.04946205   - 20,000
# ATAC-seq, Human Neurons    0.04999724   - 40,000
# ATAC-seq, Human Neurons    0.04979448   - 60,000
# ATAC-seq, Human Neurons    0.04970909   - 80,000
# ATAC-seq, Human Neurons    0.04929078   - 100,000
# ATAC-seq, Human Neurons    0.04850799   - 200,000
# ATAC-seq, Human Neurons    0.04864392   - 300,000
# Saving C and D figures
library(egg)

fill <- c("#b2d183", "#40b8d0")

# Condition names edits  
df3_m$Condition <- ifelse(df3_m$Condition == "Significant", "Sig.", "Non_sig.")
df3_m$Condition <- factor(df3_m$Condition, levels = c("Sig.", "Non_sig."))

df_m$Act_group <- ifelse(df_m$Act_group == "Significant", "Sig.", "Non_sig.")
df_m$Act_group <- factor(df_m$Act_group, levels = c("Sig.", "Non_sig."))

# This is for adjusting/matching strip.text margins. Plotting all 12 panels from a single ggplot call would have solved this problem.
m_unit = 0.15
factor = 0.8

Fig_2C <- ggplot(df3_m, aes(fill = variable, y=value, x=Condition)) + 
    geom_bar(position="stack", stat="identity")+
    theme_bw()+
    labs(x = "", 
         y = "")+
    labs(fill = "MPRA amplicon:")+  
    facet_wrap(~Epi_mark)+
    scale_fill_manual(values=fill)+
    theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
    theme(strip.text.x = element_text(size = 7, margin = margin(m_unit*factor, 
                                                                m_unit*factor, 
                                                                m_unit*factor, 
                                                                m_unit*factor, "in")))+
    theme(legend.position="bottom")+
    theme(text = element_text(size=6),
          axis.text = element_text(size=6))

Fig_2D <- ggplot(df_m, aes(x = Act_group, y=value, fill=variable)) + 
    geom_bar(position="stack", stat="identity")+
    theme_bw()+
    labs(x = "", 
         y = "")+
    labs(fill = "MPRA amplicon:")+
    scale_fill_manual(values=fill)+
    theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
    facet_wrap(~Epi_mark, labeller = label_wrap_gen(width=15))+
    theme(legend.position="none")+
    theme(strip.text.x = element_text(size = 7, margin = margin(0.1*factor, 
                                                                0.1*factor, 
                                                                0.1*factor, 
                                                                0.1*factor, "in")))+
    theme(text = element_text(size=6),
          axis.text = element_text(size=6))



#ggsave("G:/Shared drives/Nord Lab - Temp Overflow/STARR-408_Lambert&Su-Feher/Draft #Figures/Figures_eLIFE_KC/Fig_2CD.svg", ggarrange(Fig_2C, Fig_2D, widths = c(1,1), newpage = FALSE),
#       width = 3.7*2, height = 3.5, dpi = 300, units = "in")

#ggsave("G:/Shared drives/Nord Lab - Temp Overflow/STARR-408_Lambert&Su-Feher/Draft #Figures/Figures_eLIFE_KC/Fig_2CD.png", ggarrange(Fig_2C, Fig_2D, widths = c(1,1), newpage = FALSE),
#       width = 3.7*2, height = 3.5, dpi = 300, units = "in")

11.7.1 Enrichment P values

Enrichment with permutation test at P < 0.05 and peak q value at 0.05

epi_p_table_2D <- distinct(filter(df_m, P_values_perm < 0.3)[,colnames(df_m)[1:3]])

knitr::kable(arrange(epi_p_table_2D, Epi_mark))
Act_group Epi_mark P_values_perm
Significant ATAC-seq, Human Neurons 0.0496857
Significant ATAC-seq, Human Glia 0.0158962
Significant Fetal Brain TF Footprints 0.2359747

12. Values reported in the text

sum(data.all$Maxi_counts > 200)
## [1] 345

“We generated an MPRA library consisting of 345 candidate human DNA sequences for testing, each ~900 bp in size, to assess functional enhancer activity in vivo in early postnatal mouse brain.”


min_count = 200
n_of_genomic_AAV <- sum(rowSums(data.all[,c("L1_DNA_count", "L2_DNA_count", "L3_DNA_count", "L4_DNA_count")]>min_count) >=4)

n_of_genomic_AAV
## [1] 321

“We observed strong correlation between amplicon read counts in the pooled previral plasmid library (“Library”) and genomic AAV DNA (“DNA”) collected from each injected P7 brain (Figure 1B), with 321 /345 (93%) detected in the pooled previral library with >200 aligned reads present in all four DNA replicates."


sum(data.predict$training)
## [1] 248

“Then, we used the middle 80% of amplicons (N = 248) ranked by RNA/DNA ratio (Supplemental Figure 4B) to build a linear model to account for background MPRA RNA based on amplicon representation and GC content (Supplemental Figure 4C-F).”


sum(data.predict$Pvalue < 0.05)
## [1] 41
sum(data.predict$FDR < 0.05)
## [1] 0
sum(data.predict$Pvalue < 0.05 & data.predict$Residuals_Z_scaled_to_lm > 0)
## [1] 41
sum(data.predict$FDR < 0.05 & data.predict$Residuals_Z_scaled_to_lm > 0)
## [1] 0
sum(data.predict$Pvalue < 0.05 & data.predict$Residuals_Z_scaled_to_lm < 0)
## [1] 0
sum(data.predict$FDR < 0.05 & data.predict$Residuals_Z_scaled_to_lm < 0)
## [1] 0

“We identified 41 amplicons with significant changes in MPRA RNA compared to input DNA (P < 0.05), including 0 amplicons passing a false discovery rate threshold (FDR < 0.05). 41 at P < 0.05 and 0 at FDR < 0.05 amplicons (12% and 2%) with increased RNA, suggestive of positive transcriptional regulatory activity, i.e. enhancers. 0 at P < 0.05 and 0 at FDR < 0.05 amplicons had significantly reduced RNA versus expected, representing the set of amplicons that are least likely to have enhancer activity (Figure 2A).”


sum(data.predict$MeanRatio > 1.5 & data.predict$MSD > 0)
## [1] 71

We found that 71 amplicons were considered active using the criteria RNA/DNA ratio > 1.5 including amplicons with mean greater than standard deviation.


active_amp_by_lm <- data.predict$Pvalue < 0.05       # 41
active_amp_by_ratio <- data.predict$MeanRatio > 1.5  # 98

# It's more intuitive to run this on amp names instead of Booleans 
active_amp_by_lm <- data.predict$Amp_name[active_amp_by_lm]  # 41
active_amp_by_ratio <- data.predict$Amp_name[active_amp_by_ratio]  #98

setdiff(active_amp_by_lm, active_amp_by_ratio) # "331_GWAS" - is the exception
## [1] "331_GWAS"
#setdiff(active_amp_by_ratio, active_amp_by_lm)


# There are actually 40/41 amplicons identified as active using both approaches 
round(40/41, digits = 2)
## [1] 0.98

35 out of 36 amplicons identified using model residuals were also active in the ratiometric comparison.


sum(data.predict$Pvalue < 0.05)
## [1] 41
sum(data.predict$Pvalue < 0.05 & 
      data.predict$MSD > 1.5)
## [1] 20

Among 41 positive amplicons with significant regression residual activity (P < 0.05), 20 (49%) had mean ratio – 1 s.d. > 1.5 across individual replicates, representing the amplicons with the strongest and most consistent MPRA-defined enhancer activity (Figure 2B).


13. R sessionInfo()

library(pander)

pander(sessionInfo())

R version 4.0.2 (2020-06-22)

Platform: x86_64-w64-mingw32/x64 (64-bit)

locale: LC_COLLATE=English_United States.1252, LC_CTYPE=English_United States.1252, LC_MONETARY=English_United States.1252, LC_NUMERIC=C and LC_TIME=English_United States.1252

attached base packages: grid, parallel, stats4, stats, graphics, grDevices, utils, datasets, methods and base

other attached packages: pander(v.0.6.3), R.utils(v.2.10.1), R.oo(v.1.24.0), R.methodsS3(v.1.8.1), XML(v.3.99-0.3), RCurl(v.1.98-1.2), gridExtra(v.2.3), Vennerable(v.3.0), xtable(v.1.8-4), gtools(v.3.8.2), reshape(v.0.8.8), lattice(v.0.20-41), RBGL(v.1.64.0), graph(v.1.66.0), pheatmap(v.1.0.12), GenomicRanges(v.1.40.0), GenomeInfoDb(v.1.24.2), IRanges(v.2.22.2), S4Vectors(v.0.26.1), BiocGenerics(v.0.34.0), DT(v.0.16), plotly(v.4.9.2.2), RColorBrewer(v.1.1-2), ggpmisc(v.0.3.7), data.table(v.1.12.8), reshape2(v.1.4.4), genefilter(v.1.70.0), GGally(v.2.0.0), forcats(v.0.5.0), stringr(v.1.4.0), dplyr(v.1.0.0), purrr(v.0.3.4), readr(v.1.3.1), tidyr(v.1.1.0), tibble(v.3.0.1), ggplot2(v.3.3.2) and tidyverse(v.1.3.0)

loaded via a namespace (and not attached): nlme(v.3.1-148), bitops(v.1.0-6), fs(v.1.4.2), lubridate(v.1.7.9), bit64(v.4.0.5), httr(v.1.4.2), tools(v.4.0.2), backports(v.1.1.7), R6(v.2.5.0), mgcv(v.1.8-31), DBI(v.1.1.0), lazyeval(v.0.2.2), colorspace(v.1.4-1), withr(v.2.3.0), tidyselect(v.1.1.0), bit(v.4.0.4), compiler(v.4.0.2), cli(v.2.2.0), rvest(v.0.3.6), Biobase(v.2.48.0), xml2(v.1.3.2), labeling(v.0.4.2), scales(v.1.1.1), digest(v.0.6.25), rmarkdown(v.2.6), XVector(v.0.28.0), pkgconfig(v.2.0.3), htmltools(v.0.5.0), highr(v.0.8), dbplyr(v.2.0.0), htmlwidgets(v.1.5.3), rlang(v.0.4.9), readxl(v.1.3.1), rstudioapi(v.0.13), RSQLite(v.2.2.1), farver(v.2.0.3), generics(v.0.1.0), jsonlite(v.1.7.2), crosstalk(v.1.1.0.1), magrittr(v.2.0.1), polynom(v.1.4-0), GenomeInfoDbData(v.1.2.3), Matrix(v.1.2-18), Rcpp(v.1.0.6), munsell(v.0.5.0), fansi(v.0.4.1), lifecycle(v.0.2.0), stringi(v.1.4.6), yaml(v.2.2.1), zlibbioc(v.1.34.0), plyr(v.1.8.6), blob(v.1.2.1), crayon(v.1.3.4), haven(v.2.3.1), splines(v.4.0.2), annotate(v.1.66.0), hms(v.0.5.3), knitr(v.1.30), pillar(v.1.4.7), reprex(v.0.3.0), glue(v.1.4.1), evaluate(v.0.14), modelr(v.0.1.8), vctrs(v.0.3.0), cellranger(v.1.1.0), gtable(v.0.3.0), assertthat(v.0.2.1), xfun(v.0.15), broom(v.0.7.3), survival(v.3.1-12), viridisLite(v.0.3.0), AnnotationDbi(v.1.50.3), memoise(v.1.1.0) and ellipsis(v.0.3.1)